deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,341 @@
1
+ import math
2
+ import shutil
3
+ import tempfile
4
+ from typing import Dict, List, Iterator
5
+ import msgpack
6
+
7
+ import pytest
8
+ from pyarrow import RecordBatch
9
+
10
+ from deltacat.storage.rivulet.dataset import Dataset
11
+ from deltacat.storage.rivulet.fs.file_store import FileStore
12
+ from deltacat.storage.rivulet.metastore.delta import (
13
+ ManifestIO,
14
+ TreeLevel,
15
+ DeltacatManifestIO,
16
+ )
17
+
18
+ from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
19
+ from deltacat.storage.rivulet.reader.query_expression import QueryExpression
20
+ from deltacat.storage.rivulet import Schema
21
+ from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
22
+ MemtableDatasetWriter,
23
+ )
24
+
25
+ from deltacat.tests.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
26
+ from deltacat.tests.storage.rivulet.test_utils import (
27
+ write_mvp_table,
28
+ compare_mvp_table_to_scan_results,
29
+ mvp_table_to_record_batches,
30
+ validate_with_full_scan,
31
+ assert_data_file_extension_set,
32
+ create_dataset_for_method,
33
+ )
34
+
35
+ MemtableDatasetWriter.MAX_ROW_SIZE = 100
36
+
37
+
38
+ class TestBasicEndToEnd:
39
+ temp_dir = None
40
+
41
+ @classmethod
42
+ def setup_class(cls):
43
+ cls.temp_dir = tempfile.mkdtemp()
44
+ cls.dataset: Dataset = Dataset(dataset_name="test", metadata_uri=cls.temp_dir)
45
+
46
+ @classmethod
47
+ def teardown_class(cls):
48
+ shutil.rmtree(cls.temp_dir)
49
+ pass
50
+
51
+ @pytest.fixture
52
+ def ds1_schema(self, ds1_schema: Schema, ds1_dataset: MvpTable):
53
+ self.dataset.add_schema(ds1_schema, "ds1_schema")
54
+ with self.dataset.writer("ds1_schema") as writer:
55
+ write_mvp_table(writer, ds1_dataset)
56
+ return ds1_schema
57
+
58
+ def test_end_to_end_scan_pydict(self, ds1_schema, ds1_dataset):
59
+ # Read out dataset written to ds1_schema fixture, with full scan
60
+ read_records: List[Dict] = list(
61
+ self.dataset.scan(QueryExpression()).to_pydict()
62
+ ) # compare all_records to ds1
63
+ compare_mvp_table_to_scan_results(
64
+ ds1_dataset, read_records, ds1_schema.get_merge_key()
65
+ )
66
+
67
+ def test_end_to_end_scan_key_range(self, ds1_schema, ds1_dataset):
68
+ read_records_range: List[Dict] = list(
69
+ self.dataset.scan(QueryExpression().with_range(100, 500)).to_pydict()
70
+ )
71
+ assert len(read_records_range) == 401
72
+
73
+ def test_end_to_end_scan_single_key(self, ds1_schema, ds1_dataset):
74
+ read_records_single_key: List[Dict] = list(
75
+ self.dataset.scan(QueryExpression().with_key(600)).to_pydict()
76
+ )
77
+ assert len(read_records_single_key) == 1
78
+ assert read_records_single_key[0]["id"] == 600
79
+
80
+ def test_end_to_end_scan_pyarrow(self, ds1_schema, ds1_dataset):
81
+ batches: Iterator[RecordBatch] = self.dataset.scan(QueryExpression()).to_arrow()
82
+ read_records = [record for batch in batches for record in batch.to_pylist()]
83
+ compare_mvp_table_to_scan_results(
84
+ ds1_dataset, read_records, ds1_schema.get_merge_key()
85
+ )
86
+
87
+
88
+ class TestMultiLayerCompactionEndToEnd:
89
+ """Tests the merge-on-read compaction
90
+
91
+ The priority of records with the same primary key should go as follows:
92
+
93
+ 1. Prioritize higher layers over lower layers
94
+ 1. Prioritize newer SSTs over older SSTs (which is really only relevant for L0)
95
+
96
+ To this end, we'll create 4 manifests (in order of oldest to newest):
97
+ 1. L0 manifest A (oldest perhaps because of compaction) with ids {x}
98
+ 1. L1 manifest B with ids {x} and {y}
99
+ 1. L1 manifest C with ids {y}
100
+ 1. L2 manifest D with ids {y} (technically not required for this demonstration)
101
+
102
+ The output dataset should contain:
103
+ - {x} from manifest A (since it's at a higher layer than manifest B)
104
+ - {y} from manifest C (since it's newer than manifest B)
105
+ """
106
+
107
+ temp_dir = None
108
+ file_store: FileStore
109
+ manifest_io: ManifestIO
110
+
111
+ @classmethod
112
+ def setup_class(cls):
113
+ cls.temp_dir = tempfile.mkdtemp()
114
+ path, filesystem = FileStore.filesystem(cls.temp_dir)
115
+ cls.dataset: Dataset = Dataset(dataset_name="test", metadata_uri=path)
116
+ cls.file_store = cls.dataset._file_store
117
+ cls.manifest_io = DeltacatManifestIO(cls.temp_dir, cls.dataset._locator)
118
+
119
+ @classmethod
120
+ def teardown_class(cls):
121
+ shutil.rmtree(cls.temp_dir)
122
+
123
+ @pytest.fixture
124
+ def l0_overwrite(self, ds1_dataset):
125
+ """Transform the 2nd half of the records"""
126
+ return self._transform_dataset(
127
+ ds1_dataset,
128
+ math.floor(FIXTURE_ROW_COUNT / 2),
129
+ FIXTURE_ROW_COUNT,
130
+ transform_name=lambda x: "overwritten",
131
+ transform_age=lambda x: None,
132
+ )
133
+
134
+ @pytest.fixture
135
+ def l1_overwrite(self, ds1_dataset):
136
+ """Transform the 1st half of the records"""
137
+ return self._transform_dataset(
138
+ ds1_dataset,
139
+ 0,
140
+ math.floor(FIXTURE_ROW_COUNT / 2),
141
+ transform_name=lambda x: "overwritten",
142
+ )
143
+
144
+ @pytest.fixture
145
+ def l2_ignored(self, ds1_dataset):
146
+ """Transform the 1st half of the records"""
147
+ return self._transform_dataset(
148
+ ds1_dataset,
149
+ 0,
150
+ math.floor(FIXTURE_ROW_COUNT / 2),
151
+ transform_name=lambda x: "ignored",
152
+ )
153
+
154
+ def _transform_dataset(
155
+ self,
156
+ dataset,
157
+ min_index=0,
158
+ max_index=FIXTURE_ROW_COUNT,
159
+ transform_id=lambda x: x,
160
+ transform_name=lambda x: x,
161
+ transform_age=lambda x: x,
162
+ ):
163
+ data = dataset.data
164
+ return MvpTable(
165
+ {
166
+ "id": [transform_id(x) for x in data["id"][min_index:max_index]],
167
+ "name": [transform_name(x) for x in data["name"][min_index:max_index]],
168
+ "age": [transform_age(x) for x in data["age"][min_index:max_index]],
169
+ }
170
+ )
171
+
172
+ @pytest.fixture
173
+ def expected_dataset(self, l1_overwrite, l0_overwrite):
174
+ return MvpTable(
175
+ {
176
+ "id": l1_overwrite.data["id"] + l0_overwrite.data["id"],
177
+ "name": l1_overwrite.data["name"] + l0_overwrite.data["name"],
178
+ "age": l1_overwrite.data["age"] + l0_overwrite.data["age"],
179
+ }
180
+ )
181
+
182
+ @pytest.fixture
183
+ def ds1_written_uri(
184
+ self, ds1_schema, ds1_dataset, l2_ignored, l0_overwrite, l1_overwrite
185
+ ):
186
+ print(f"Writing test data to directory {self.temp_dir}")
187
+ self.dataset.add_schema(ds1_schema, "ds1_schema")
188
+ # oldest at L0 (should take precedence)
189
+ self.write_dataset("ds1_schema", l0_overwrite)
190
+ # original dataset (at L1)
191
+ uri = self.write_dataset("ds1_schema", ds1_dataset)
192
+ self.rewrite_at_level(uri, 1)
193
+ # newer dataset at L1 (should take precedence)
194
+ uri = self.write_dataset("ds1_schema", l1_overwrite)
195
+ self.rewrite_at_level(uri, 1)
196
+ # newer at L2 (loses out to L0 data)
197
+ uri = self.write_dataset("ds1_schema", l2_ignored)
198
+ self.rewrite_at_level(uri, 2)
199
+
200
+ def test_end_to_end_scan(self, ds1_written_uri, ds1_schema, expected_dataset):
201
+ """Rewrite entire dataset into 2nd manifest with same primary keys but "redacted" name."""
202
+ read_records: List[Dict] = list(
203
+ self.dataset.scan(QueryExpression()).to_pydict()
204
+ )
205
+ key = ds1_schema.get_merge_key()
206
+ rows_by_key: Dict[str, MvpRow] = expected_dataset.to_rows_by_key(key)
207
+ assert len(read_records) == len(rows_by_key)
208
+ for record in read_records:
209
+ pk_val = record[key]
210
+ assert record == rows_by_key[pk_val].data
211
+
212
+ # Test scan primary key range
213
+ read_records_range: List[Dict] = list(
214
+ self.dataset.scan(QueryExpression().with_range(100, 500)).to_pydict()
215
+ )
216
+ assert len(read_records_range) == 401
217
+
218
+ # Test scan single primary key
219
+ read_records_single_key: List[Dict] = list(
220
+ self.dataset.scan(QueryExpression().with_key(600)).to_pydict()
221
+ )
222
+ assert len(read_records_single_key) == 1
223
+ assert read_records_single_key[0]["id"] == 600
224
+
225
+ def write_dataset(self, schema_name: str, dataset) -> str:
226
+ ds1_writer = self.dataset.writer(schema_name)
227
+ write_mvp_table(ds1_writer, dataset)
228
+ return ds1_writer.flush()
229
+
230
+ def rewrite_at_level(self, uri: str, level: TreeLevel):
231
+ """Rewrite the given manifest with the new tree level
232
+
233
+ TODO: replace this with a compaction operation
234
+ """
235
+ with open(uri, "rb") as f:
236
+ data = msgpack.unpack(f)
237
+ data["level"] = level
238
+
239
+ with open(uri, "wb") as f:
240
+ msgpack.pack(data, f)
241
+
242
+
243
+ class TestZipperMergeEndToEnd:
244
+ temp_dir = None
245
+ file_store: FileStore
246
+
247
+ @classmethod
248
+ def setup_class(cls):
249
+ cls.temp_dir = tempfile.mkdtemp()
250
+ path, filesystem = FileStore.filesystem(cls.temp_dir)
251
+ cls.dataset: Dataset = Dataset(dataset_name="test", metadata_uri=cls.temp_dir)
252
+ cls.file_store = FileStore(path, filesystem=filesystem)
253
+
254
+ @classmethod
255
+ def teardown_class(cls):
256
+ shutil.rmtree(cls.temp_dir)
257
+
258
+ @pytest.fixture
259
+ def schema1(self, ds1_dataset: MvpTable, ds1_schema: Schema):
260
+ self.dataset.add_schema(ds1_schema, "ds1_schema")
261
+ with self.dataset.writer("ds1_schema") as writer:
262
+ write_mvp_table(writer, ds1_dataset)
263
+ return ds1_schema
264
+
265
+ @pytest.fixture
266
+ def schema2(self, ds2_dataset: MvpTable, ds2_schema: Schema):
267
+ self.dataset.add_schema(ds2_schema, "ds2_schema")
268
+ with self.dataset.writer("ds2_schema") as writer:
269
+ write_mvp_table(writer, ds2_dataset)
270
+ return ds2_schema
271
+
272
+ def test_end_to_end_scan(
273
+ self,
274
+ schema1,
275
+ schema2,
276
+ ds1_schema,
277
+ ds1_dataset,
278
+ ds2_dataset,
279
+ ds2_schema,
280
+ combined_schema,
281
+ ):
282
+ read_records: List[Dict] = list(
283
+ self.dataset.scan(QueryExpression()).to_pydict()
284
+ )
285
+
286
+ merge_key = ds1_schema.get_merge_key()
287
+ ds1_rows_by_pk: Dict[str, MvpRow] = ds1_dataset.to_rows_by_key(merge_key)
288
+ ds2_rows_by_pk: Dict[str, MvpRow] = ds2_dataset.to_rows_by_key(merge_key)
289
+
290
+ assert len(read_records) == len(ds1_rows_by_pk)
291
+ for record in read_records:
292
+ pk_val = record[merge_key]
293
+ ds1_row = ds1_rows_by_pk[pk_val]
294
+ ds2_row = ds2_rows_by_pk[pk_val]
295
+ expected_merged_record = ds1_row.data | ds2_row.data
296
+ assert expected_merged_record == record
297
+
298
+ # Test scan primary key range
299
+ read_records_range: List[Dict] = list(
300
+ self.dataset.scan(QueryExpression().with_range(100, 500)).to_pydict()
301
+ )
302
+ assert len(read_records_range) == 401
303
+
304
+ # Test scan single primary key
305
+ read_records_single_key: List[Dict] = list(
306
+ self.dataset.scan(QueryExpression().with_key(600)).to_pydict()
307
+ )
308
+ assert len(read_records_single_key) == 1
309
+ assert read_records_single_key[0]["id"] == 600
310
+
311
+
312
+ class TestDataFormatSupport:
313
+ temp_dir = None
314
+ file_store: FileStore
315
+
316
+ @classmethod
317
+ def setup_class(cls):
318
+ cls.temp_dir = tempfile.mkdtemp()
319
+ path, filesystem = FileStore.filesystem(cls.temp_dir)
320
+ cls.file_store = FileStore(path, filesystem=filesystem)
321
+
322
+ @classmethod
323
+ def teardown_class(cls):
324
+ shutil.rmtree(cls.temp_dir)
325
+ pass
326
+
327
+ # TODO expand coverage - below test is more like smoke test since dataset rows the same across types
328
+ def test_mixed_content_dataset(self, dataset_images_with_label):
329
+ dataset = create_dataset_for_method(self.temp_dir)
330
+ table, schema = dataset_images_with_label
331
+ dataset.add_schema(schema, "schema")
332
+ with dataset.writer("schema", "feather") as writer:
333
+ record_batch = mvp_table_to_record_batches(table, schema)
334
+ writer.write([record_batch])
335
+
336
+ with dataset.writer("schema", "parquet") as writer:
337
+ record_batch = mvp_table_to_record_batches(table, schema)
338
+ writer.write([record_batch])
339
+
340
+ validate_with_full_scan(dataset, table, schema)
341
+ assert_data_file_extension_set(dataset, {".feather", ".parquet"})
@@ -0,0 +1,79 @@
1
+ import pytest
2
+ import shutil
3
+ import tempfile
4
+
5
+ from deltacat.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
7
+ MemtableDatasetWriter,
8
+ )
9
+ from ..test_utils import (
10
+ write_mvp_table,
11
+ mvp_table_to_record_batches,
12
+ validate_with_full_scan,
13
+ create_dataset_for_method,
14
+ assert_data_file_extension,
15
+ )
16
+
17
+ MemtableDatasetWriter.MAX_ROW_SIZE = 100
18
+
19
+
20
+ class TestWriter:
21
+ temp_dir = None
22
+ file_store: FileStore
23
+
24
+ @classmethod
25
+ def setup_class(cls):
26
+ cls.temp_dir = tempfile.mkdtemp()
27
+ path, filesystem = FileStore.filesystem(cls.temp_dir)
28
+ cls.file_store = FileStore(path, filesystem)
29
+
30
+ @classmethod
31
+ def teardown_class(cls):
32
+ shutil.rmtree(cls.temp_dir)
33
+ pass
34
+
35
+ def test_write_unsupported_data_type(self, ds1_dataset, ds1_schema):
36
+ dataset = create_dataset_for_method(self.temp_dir)
37
+ dataset.add_schema(ds1_schema, "ds1_schema")
38
+ with dataset.writer("ds1_schema") as writer:
39
+ with pytest.raises(ValueError):
40
+ writer.write("a string")
41
+
42
+ def test_write_pydict(self, ds1_dataset, ds1_schema):
43
+ dataset = create_dataset_for_method(self.temp_dir)
44
+ dataset.add_schema(ds1_schema, "ds1_schema")
45
+ with dataset.writer("ds1_schema") as writer:
46
+ write_mvp_table(writer, ds1_dataset)
47
+
48
+ validate_with_full_scan(dataset, ds1_dataset, ds1_schema)
49
+
50
+ def test_write_record_batch(self, ds1_dataset, ds1_schema):
51
+ dataset = create_dataset_for_method(self.temp_dir)
52
+ dataset.add_schema(ds1_schema, "ds1_schema")
53
+ with dataset.writer("ds1_schema") as writer:
54
+ record_batch = mvp_table_to_record_batches(ds1_dataset, ds1_schema)
55
+ writer.write(record_batch)
56
+
57
+ validate_with_full_scan(dataset, ds1_dataset, ds1_schema)
58
+
59
+ def test_write_list_of_record_batch(self, ds1_dataset, ds1_schema):
60
+ dataset = create_dataset_for_method(self.temp_dir)
61
+ dataset.add_schema(ds1_schema, "ds1_schema")
62
+ with dataset.writer("ds1_schema", "feather") as writer:
63
+ record_batch = mvp_table_to_record_batches(ds1_dataset, ds1_schema)
64
+ writer.write([record_batch])
65
+
66
+ validate_with_full_scan(dataset, ds1_dataset, ds1_schema)
67
+ assert_data_file_extension(dataset, ".feather")
68
+
69
+ def test_write_feather(self, dataset_images_with_label):
70
+ dataset = create_dataset_for_method(self.temp_dir)
71
+
72
+ table, schema = dataset_images_with_label
73
+ dataset.add_schema(schema, "schema")
74
+ with dataset.writer("schema", "feather") as writer:
75
+ record_batch = mvp_table_to_record_batches(table, schema)
76
+ writer.write([record_batch])
77
+
78
+ validate_with_full_scan(dataset, table, schema)
79
+ assert_data_file_extension(dataset, "feather")
@@ -0,0 +1,75 @@
1
+ import pytest
2
+
3
+ from deltacat import Dataset
4
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
5
+ from deltacat.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
7
+ from deltacat.storage.rivulet import Schema
8
+ from deltacat.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
10
+ MemtableDatasetWriter,
11
+ )
12
+
13
+
14
+ @pytest.fixture
15
+ def test_schema():
16
+ return Schema(
17
+ fields=[
18
+ ("id", Datatype.int32()),
19
+ ("name", Datatype.string()),
20
+ ],
21
+ merge_keys="id",
22
+ )
23
+
24
+
25
+ @pytest.fixture
26
+ def resolve_path_and_filesystem(tmp_path):
27
+ return FileStore.filesystem(tmp_path)
28
+
29
+
30
+ @pytest.fixture
31
+ def file_provider(resolve_path_and_filesystem):
32
+ path, filesystem = resolve_path_and_filesystem
33
+ file_store = FileStore(path, filesystem)
34
+ return FileProvider(path, file_store)
35
+
36
+
37
+ @pytest.fixture
38
+ def file_store(resolve_path_and_filesystem):
39
+ path, filesystem = resolve_path_and_filesystem
40
+ return FileStore(path, filesystem=filesystem)
41
+
42
+
43
+ def test_write_after_flush(tmp_path, test_schema):
44
+ dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
45
+ file_store = dataset._file_store
46
+ writer = MemtableDatasetWriter(
47
+ file_provider=dataset._file_provider,
48
+ schema=test_schema,
49
+ locator=dataset._locator,
50
+ )
51
+ writer.write_dict({"id": 100, "name": "alpha"})
52
+ manifest_uri_1 = writer.flush()
53
+
54
+ manifest_io = DeltacatManifestIO(writer.file_provider.uri, dataset._locator)
55
+ manifest_1 = manifest_io.read(manifest_uri_1)
56
+ sst_files_1 = manifest_1.sst_files
57
+
58
+ assert len(sst_files_1) > 0, "First flush: no SST files found."
59
+ assert manifest_1.context.schema == writer.schema, "Schema mismatch in first flush."
60
+
61
+ writer.write_dict({"id": 200, "name": "gamma"})
62
+ manifest_uri_2 = writer.flush()
63
+
64
+ manifest_2 = manifest_io.read(file_store.create_input_file(manifest_uri_2).location)
65
+ sst_files_2 = manifest_2.sst_files
66
+
67
+ assert len(sst_files_2) > 0, "Second flush: no SST files found."
68
+
69
+ # ensures data_files and sst_files from first write are not included in second write.
70
+ assert set(sst_files_1).isdisjoint(
71
+ set(sst_files_2)
72
+ ), "Expected no overlap of SST files between first and second flush."
73
+ assert (
74
+ manifest_2.context.schema == writer.schema
75
+ ), "Schema mismatch in second flush."
@@ -0,0 +1,39 @@
1
+ import shutil
2
+ import tempfile
3
+ import deltacat as dc
4
+
5
+
6
class TestDeltaCAT:
    """Cross-catalog integration tests against two local DeltaCAT catalogs."""

    @classmethod
    def setup_class(cls):
        # One scratch directory per catalog root.
        cls.temp_dir_1 = tempfile.mkdtemp()
        cls.temp_dir_2 = tempfile.mkdtemp()
        # Initialize DeltaCAT with two local catalogs.
        dc.put("test_catalog_1", root=cls.temp_dir_1)
        dc.put("test_catalog_2", root=cls.temp_dir_2)

    @classmethod
    def teardown_class(cls):
        for scratch_dir in (cls.temp_dir_1, cls.temp_dir_2):
            shutil.rmtree(scratch_dir)

    def test_cross_catalog_namespace_copy(self):
        # Given two empty DeltaCAT catalogs.
        # When a namespace is copied across catalogs.
        namespace_src = dc.put("test_catalog_1/test_namespace")
        namespace_dst = dc.copy(
            "test_catalog_1/test_namespace",
            "test_catalog_2",
        )
        # Expect the catalog namespace created in each catalog
        # method to be equivalent and equal to the source namespace.
        assert namespace_src.equivalent_to(namespace_dst)
        assert namespace_src == namespace_dst

        # When each catalog namespace is fetched explicitly,
        # expect them to be equivalent but not equal
        # (their metafile IDs differ).
        fetched_src = dc.get("test_catalog_1/test_namespace")
        fetched_dst = dc.get("test_catalog_2/test_namespace")
        assert fetched_src.equivalent_to(fetched_dst)
        assert not fetched_src == fetched_dst
@@ -0,0 +1,14 @@
1
+ import shutil
2
+ import tempfile
3
+
4
+
5
+ from contextlib import contextmanager
6
+
7
+
8
@contextmanager
def temp_dir_autocleanup():
    """Context manager yielding a fresh temporary directory.

    The directory (and anything left inside it) is removed on exit.
    ``ignore_errors=True`` keeps cleanup from raising — e.g. when the
    ``with`` body already deleted the directory — which would otherwise
    mask an exception propagating out of the block.

    Yields:
        str: Absolute path of the temporary directory.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        yield tmpdir
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
@@ -0,0 +1,54 @@
1
+ import base64
2
+ import msgpack
3
+ import json
4
+ import os
5
+ import shutil
6
+
7
+ from tempfile import mkdtemp
8
+
9
+
10
def _convert_bytes_to_base64_str(obj):
    """Recursively replace ``bytes`` values inside *obj*, in place, with
    their base64-encoded UTF-8 string form so the structure is JSON-safe.

    Only dict values and list items are converted; nested dicts/lists are
    traversed recursively. Nothing is returned — *obj* is mutated.
    """
    if isinstance(obj, dict):
        for key in obj:
            value = obj[key]
            if isinstance(value, bytes):
                obj[key] = base64.b64encode(value).decode("utf-8")
            elif isinstance(value, (dict, list)):
                _convert_bytes_to_base64_str(value)
    elif isinstance(obj, list):
        for index, item in enumerate(obj):
            if isinstance(item, bytes):
                obj[index] = base64.b64encode(item).decode("utf-8")
            elif isinstance(item, (dict, list)):
                _convert_bytes_to_base64_str(item)
25
+
26
+
27
+ def copy_and_convert(src_dir, dst_dir=None):
28
+ """
29
+ Helper function for copying a metastore recursively and converting all
30
+ messagepack files to json. This can be used manually to more easily
31
+ introspect metastore metadata.
32
+ """
33
+ if dst_dir is None:
34
+ dst_dir = mkdtemp()
35
+ print(f"destination is: {dst_dir}")
36
+ if not os.path.exists(dst_dir):
37
+ os.makedirs(dst_dir)
38
+
39
+ for item in os.listdir(src_dir):
40
+ src_path = os.path.join(src_dir, item)
41
+ dst_path = os.path.join(dst_dir, item)
42
+
43
+ if os.path.isdir(src_path):
44
+ copy_and_convert(src_path, dst_path)
45
+ else:
46
+ if item.endswith(".mpk"):
47
+ with open(src_path, "rb") as f:
48
+ data = msgpack.unpackb(f.read(), raw=False)
49
+ _convert_bytes_to_base64_str(data)
50
+ dst_path = dst_path[:-4] + ".json"
51
+ with open(dst_path, "w") as f:
52
+ json.dump(data, f)
53
+ else:
54
+ shutil.copy2(src_path, dst_path)
@@ -47,8 +47,7 @@ def stage_partition_from_file_paths(
47
47
 
48
48
  def commit_delta_to_staged_partition(
49
49
  staged_partition,
50
- file_paths: List[str] = None,
51
- pa_table: pa.Table = None,
50
+ file_paths: List[str],
52
51
  content_type: ContentType = ContentType.PARQUET,
53
52
  *args,
54
53
  **kwargs,
@@ -58,7 +57,6 @@ def commit_delta_to_staged_partition(
58
57
  *args,
59
58
  file_paths=file_paths,
60
59
  content_type=content_type,
61
- pa_table=pa_table,
62
60
  **kwargs,
63
61
  )
64
62
  ds.commit_partition(staged_partition, **kwargs)
@@ -78,28 +76,23 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
78
76
 
79
77
def commit_delta_to_partition(
    partition: Union[Partition, PartitionLocator],
    file_paths: List[str],
    content_type: ContentType = ContentType.PARQUET,
    *args,
    **kwargs,
) -> Delta:
    """Read the given CSV files, stage their concatenation as one delta on
    the partition, and commit that delta.

    Args:
        partition: Target partition, or a locator resolved via
            ``ds.get_partition``.
        file_paths: CSV files to read with ``pa.csv.read_csv``.
        content_type: Content type for the staged delta.

    Returns:
        The committed Delta.
    """
    # Resolve a locator into a concrete partition before staging.
    if isinstance(partition, PartitionLocator):
        partition = ds.get_partition(
            partition.stream_locator, partition.partition_values, *args, **kwargs
        )

    combined = pa.concat_tables(
        [pa.csv.read_csv(file_path) for file_path in file_paths]
    )

    staged_delta = ds.stage_delta(
        combined, partition, content_type=content_type, **kwargs
    )
    return ds.commit_delta(staged_delta, **kwargs)