deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,105 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Callable, Any, Protocol
3
+
4
+ from deltacat.storage.rivulet.dataset_executor import DatasetExecutor
5
+ from deltacat.storage.rivulet.mvp.Table import MvpTable
6
+ from deltacat.storage.rivulet import Schema
7
+
8
+
9
class DatasetOperation(Protocol):
    """
    Structural interface for a single step of a logical plan.

    The operations defined in this module implement ``visit`` by invoking the
    matching method on the executor they are handed.
    """

    def visit(self, executor: DatasetExecutor):
        """Apply this operation through ``executor``."""
        ...
12
+
13
+
14
@dataclass
class SelectOperation(DatasetOperation):
    """
    Keep only a named subset of the fields in the schema.

    TODO need better interface for defining selection
    (e.g. "all fields except X")

    TODO in the future this should support basic filters (e.g. on primary key)
    """

    # Names of the fields to retain.
    fields: List[str]

    def visit(self, executor: DatasetExecutor):
        """Hand the selected field names to ``executor.select``."""
        selected = self.fields
        executor.select(selected)
29
+
30
+
31
@dataclass
class MapOperation(DatasetOperation):
    """
    Apply a callable to every record in the dataset.

    TODO need more sophistication in the interface of the callable function
    For now we will be super simple and just call the transform on each record
    """

    # Callable handed to the executor's map step.
    transform: Callable[[Any], Any]

    def visit(self, executor: DatasetExecutor):
        """Hand the transform to ``executor.map``."""
        fn = self.transform
        executor.map(fn)
44
+
45
+
46
class CollectOperation(DatasetOperation):
    """
    Materialize the dataset by asking the executor to collect it.
    """

    def visit(self, executor: DatasetExecutor):
        """Trigger ``executor.collect``; takes no arguments of its own."""
        executor.collect()
53
+
54
+
55
class LogicalPlan:
    """
    A fluent builder for constructing a sequence of dataset operations.

    Each builder method (select, map, collect) appends one operation and returns
    the plan itself so that calls can be chained. Nothing is run until execute()
    is called; at that point every recorded operation visits the supplied
    DatasetExecutor, in the order the operations were added (visitor pattern).

    Example:
        plan = LogicalPlan(schema).select(["name", "age"]).map(lambda record: record)
        table = plan.execute(executor)
    """

    def __init__(self, schema: Schema):
        """
        Start an empty plan over ``schema``.

        The original schema is kept untouched in ``self.schema``; a copy is kept in
        ``self.effective_schema`` and narrowed by select() calls.
        """
        self.operations: List[DatasetOperation] = []
        self.schema = schema
        # Tracks effective schema to perform each operation on
        self.effective_schema: Schema = schema.__deepcopy__()

    def select(self, filter: List[str]) -> "LogicalPlan":
        """
        Append a selection of the given field names.

        Args:
            filter: names of the fields to keep. Every name must still be present
                in the effective schema (i.e. not removed by an earlier select).

        Returns:
            This plan, to allow chaining.

        Raises:
            ValueError: if any requested name is not in the effective schema.
        """
        # Validate that select statement is allowed and mutate effective schema for future validations
        invalid_fields = [
            field for field in filter if field not in self.effective_schema.fields
        ]
        if invalid_fields:
            raise ValueError(f"Invalid fields: {', '.join(invalid_fields)}")

        # Collect the names to drop first so the schema is not mutated while
        # its keys are being iterated.
        remove_fields = [
            field for field in self.effective_schema.keys() if field not in filter
        ]
        for field in remove_fields:
            del self.effective_schema[field]

        self.operations.append(SelectOperation(filter))
        return self

    def map(self, transform: Callable[[dict], dict]) -> "LogicalPlan":
        """Append a map step that applies ``transform``; returns this plan."""
        self.operations.append(MapOperation(transform))
        return self

    def collect(self) -> "LogicalPlan":
        """Append a collect (materialize) step; returns this plan."""
        self.operations.append(CollectOperation())
        return self

    def execute(self, executor: DatasetExecutor) -> "MvpTable":
        """
        Run every recorded operation against ``executor`` in insertion order.

        Returns:
            ``executor.output`` as it stands after the last operation has visited.
        """
        for operation in self.operations:
            operation.visit(executor)
        return executor.output
File without changes
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol, NamedTuple, List
4
+ import time
5
+
6
+ from deltacat.storage import (
7
+ ManifestMeta,
8
+ EntryType,
9
+ DeltaLocator,
10
+ Delta,
11
+ DeltaType,
12
+ Transaction,
13
+ TransactionType,
14
+ TransactionOperation,
15
+ TransactionOperationType,
16
+ )
17
+ from deltacat.storage.model.manifest import Manifest, ManifestEntryList, ManifestEntry
18
+ from deltacat.storage.model.partition import PartitionLocator
19
+ from deltacat.storage.model.transaction import TransactionOperationList
20
+
21
+ from deltacat.storage.model.types import StreamFormat
22
+ from deltacat.storage.rivulet import Schema
23
+
24
# Plain-int type aliases used in the signatures below; they add no runtime checks.
StreamPosition = int
"""The stream position for creating a consistent ordering of manifests."""
TreeLevel = int
"""The level of the manifest in the LSM-tree."""
28
+
29
+
30
class DeltaContext(NamedTuple):
    """Minimal amount of manifest context that may need to be circulated independently or alongside individual files"""

    # Schema needed to understand which field group was added when writing manifest
    # TODO in the future we should use something like a field group id and keep schema in dataset-level metadata
    schema: Schema
    # Position used to order manifests consistently (see the StreamPosition alias).
    stream_position: StreamPosition
    # Level of the manifest in the LSM-tree (see the TreeLevel alias).
    level: TreeLevel
38
+
39
+
40
class RivuletDelta(dict):
    """
    Temporary class during merging of deltacat/rivulet metadata formats

    A dict that wraps a deltacat Delta (stored under "dcDelta") together with a
    DeltaContext (stored under "DeltaContext"). It exists for two reasons:
    1. Avoid big bang refactor in which consumers of RivuletDelta have to update their code to consume deltacat Delta/Manifest
    2. Provide more time to figure out how to represent SST files / schema / etc within deltacat constructs

    """

    context: DeltaContext

    @staticmethod
    def of(delta: Delta) -> RivuletDelta:
        """
        Wrap ``delta``, deriving its DeltaContext from the delta's "schema" and
        "level" entries and its stream position.
        """
        parsed_schema = Schema.from_dict(delta.get("schema"))
        wrapped = RivuletDelta()
        wrapped["dcDelta"] = delta
        wrapped["DeltaContext"] = DeltaContext(
            parsed_schema,
            delta.stream_position,
            delta.get("level"),
        )
        return wrapped

    @property
    def dcDelta(self) -> Delta:
        """The wrapped deltacat Delta, or None when it has not been set."""
        return self.get("dcDelta")

    @property
    def sst_files(self) -> List[str]:
        """
        URIs of the SST files for this delta.

        Computed from the wrapped delta's manifest entries on first access and
        cached under the "sst_files" key afterwards.
        """
        if "sst_files" in self:
            return self["sst_files"]
        uris = [entry.uri for entry in self.dcDelta.manifest.entries]
        self["sst_files"] = uris
        return self["sst_files"]

    @sst_files.setter
    def sst_files(self, files: List[str]):
        self["sst_files"] = files

    @property
    def context(self) -> DeltaContext:
        """The DeltaContext stored on this delta; raises KeyError if absent."""
        return self["DeltaContext"]

    @context.setter
    def context(self, mc: DeltaContext):
        self["DeltaContext"] = mc
84
+
85
+
86
class ManifestIO(Protocol):
    """
    Minimal interface for reading and writing manifest files
    """

    def write(
        self,
        sst_files: List[str],
        schema: Schema,
        level: TreeLevel,
    ) -> str:
        """
        Persist a manifest that references ``sst_files`` for ``schema`` at the
        given tree ``level``.

        Returns a string identifying what was written (DeltacatManifestIO in this
        module returns the path of the written metafile).
        """
        ...

    def read(self, file: str) -> RivuletDelta:
        """Load the manifest stored at ``file`` as a RivuletDelta."""
        ...
101
+
102
+
103
class DeltacatManifestIO(ManifestIO):
    """
    Writes manifest data, but by writing to a Deltacat metastore using Deltacat delta/manifest classes.
    """

    def __init__(self, root: str, locator: PartitionLocator):
        # root: location passed to Transaction.commit()
        # locator: partition whose identifiers are copied onto each delta locator
        self.root = root
        self.locator = locator

    def write(
        self,
        sst_files: List[str],
        schema: Schema,
        level: TreeLevel,
    ) -> str:
        """
        Commit a new APPEND delta whose manifest lists ``sst_files``.

        The schema (as a dict) and the level are stored on the delta under the
        "schema" and "level" keys. Returns the single metafile path reported by
        the commit.
        """
        # Currently, we use the "data files" manifest entry field for SST files
        # This is a bit of a hack - we should consider how to better model SST files
        # (e.g.: add Manifest entry of type "SST") and decide whether we also need to record data files separately
        # even though they're referenced by SST
        # Ticket: https://github.com/ray-project/deltacat/issues/469
        entries = ManifestEntryList()
        for sst_uri in sst_files:
            # TODO have rivulet writer populate these values
            # see: https://github.com/ray-project/deltacat/issues/476
            entry_meta = ManifestMeta.of(
                record_count=None,  # or known
                content_length=None,
                content_type=None,
                content_encoding=None,
                entry_type=EntryType.DATA,
            )
            entries.append(ManifestEntry.of(url=sst_uri, meta=entry_meta))
        dc_manifest = Manifest.of(entries=entries)

        # Create delta and transaction which writes manifest to root
        # TODO replace this with higher level storage interface for deltacat
        partition = self.locator
        delta_locator = DeltaLocator.at(
            namespace=partition.namespace,
            table_name=partition.table_name,
            table_version=partition.table_version,
            partition_id=partition.partition_id,
            partition_values=partition.partition_values,
            stream_id=partition.stream_id,
            stream_format=StreamFormat.DELTACAT,
            # Using nanosecond-resolution timestamp (time.time_ns) as stream position
            # TODO consider having storage interface auto assign stream position
            stream_position=time.time_ns(),
        )

        delta = Delta.of(
            locator=delta_locator,
            delta_type=DeltaType.APPEND,
            meta=None,
            properties={},
            manifest=dc_manifest,
        )
        # TODO later support multiple schemas (https://github.com/ray-project/deltacat/issues/468)
        delta["schema"] = schema.to_dict()
        # TODO consider if level should be added as first class key to delta or
        # kept as specific to storage interface
        delta["level"] = level

        create_delta = TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=delta,
        )
        transaction = Transaction.of(
            txn_type=TransactionType.APPEND,
            txn_operations=TransactionOperationList.of([create_delta]),
        )
        commit_result = transaction.commit(self.root)
        # The first element of the commit result is treated as the written paths.
        written_paths = commit_result[0]
        assert (
            len(written_paths) == 1
        ), "expected delta commit transaction to write exactly 1 metafile"
        return written_paths[0]

    def read(self, file: str):
        """Read the Delta stored at ``file`` and wrap it as a RivuletDelta."""
        return RivuletDelta.of(Delta.read(file))
@@ -0,0 +1,105 @@
1
+ import logging
2
+ import json
3
+
4
+ from itertools import zip_longest
5
+ from typing import List
6
+
7
+ from deltacat.storage.rivulet.fs.input_file import InputFile
8
+ from deltacat.storage.rivulet.fs.output_file import OutputFile
9
+ from deltacat.storage.rivulet.metastore.sst import (
10
+ SSTWriter,
11
+ SSTableRow,
12
+ SSTReader,
13
+ SSTable,
14
+ )
15
+ from deltacat import logs
16
+
17
# Module-level logger, set up through deltacat's logging helper.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
18
+
19
+
20
class JsonSstWriter(SSTWriter):
    """
    Class for writing Json SST files

    TODO use a more efficient format or compression. Also can factor out URI prefix across rows
    We can also optimize by omitting offset_end if sequential rows use the same uri
    """

    def write(self, file: OutputFile, rows: List[SSTableRow]) -> None:
        """
        Writes SST file

        Nothing is written when ``rows`` is empty. Otherwise a single JSON object
        is written containing file-level "key_min" (from the first row), "key_max"
        and "offset_end" (both from the last row), plus a "metadata" list with one
        entry per row. Per-row offset_end values are not stored.
        """
        if not rows:
            return

        # File-level metadata for key min/max and offset end
        first_row = rows[0]
        last_row = rows[-1]
        lowest_key = first_row.key_min
        highest_key = last_row.key_max
        final_offset = last_row.offset_end

        # Convert to dict for json serialization
        row_dicts = []
        for row in rows:
            row_dicts.append(
                {
                    "key_min": row.key_min,
                    "key_max": row.key_max,
                    "offset": row.offset_start,
                    "uri": row.uri,
                }
            )

        payload = {
            "key_min": lowest_key,
            "key_max": highest_key,
            "offset_end": final_offset,
            "metadata": row_dicts,
        }

        try:
            with file.create() as f:
                f.write(json.dumps(payload).encode())
            logger.debug(f"SSTable data successfully written to {file.location}")
        except Exception as e:
            # TODO better error handling for IO
            logger.debug(f"Unexpected error occurred while writing SSTable data: {e}")
            raise
66
+
67
+
68
class JsonSstReader(SSTReader):
    """
    Reads SST files stored as a single JSON object with top-level "key_min",
    "key_max" and "offset_end" values and a "metadata" list of per-row dicts
    (each holding "key_min", "key_max", "uri" and "offset").
    """

    def read(self, file: InputFile) -> SSTable:
        """
        Parse ``file`` into an in-memory SSTable.

        Raises:
            KeyError: if an expected key is missing from the JSON document.
        """
        with file.open() as f:
            data = json.loads(f.read())

        file_offset_end = data["offset_end"]

        # NOTE(review): the JSON layout read here carries only a file-level
        # "offset_end", so every row is given that same value as its offset_end.
        # The previous code paired each row with its successor but built an
        # identical SSTableRow in both branches, so the pairing had no effect;
        # it is removed here without changing the result. Whether a row's end
        # should instead come from the next row's "offset" (when both rows share
        # a uri) needs confirming against the writer before changing behavior.
        sst_rows: List[SSTableRow] = [
            SSTableRow(
                row["key_min"],
                row["key_max"],
                row["uri"],
                row["offset"],
                file_offset_end,
            )
            for row in data["metadata"]
        ]

        return SSTable(sst_rows, data["key_min"], data["key_max"])
@@ -0,0 +1,82 @@
1
+ from dataclasses import dataclass
2
+ from typing import Protocol, Any, List
3
+
4
+ from deltacat.storage.rivulet.fs.input_file import InputFile
5
+ from deltacat.storage.rivulet.fs.output_file import OutputFile
6
+
7
+
8
@dataclass(frozen=True)
class SSTableRow:
    """
    Row of Sorted String Table.

    The metadata for a SSTable row referencing a offset-range of a data file containing data sorted by primary key.

    Note that the actual file format for SSTables can omit some of these fields (e.g. key_max and offset_end) taking
    advantage of sorted nature of file.
    """

    key_min: Any
    """The first primary key found in referenced data range, inclusive."""
    key_max: Any
    """The last primary key found in referenced data range, inclusive."""
    uri: str
    """The URI of the data file containing this row's data.
    Will be format dependent, e.g. file://<absolute_path> or s3://<bucket>/<key>
    Note that if uri_prefix is specified in SSTable, this will just be a postfix
    """
    offset_start: int
    """
    The starting offset into the data file for data referenced by this row.
    Note this offset is format dependent - e.g. for Parquet it will be zero-indexed row group
    For other formats it will be byte offset into file
    """
    offset_end: int
    """
    offset end for data range within uri.
    """
43
+
44
+
45
@dataclass(frozen=True)
class SSTable:
    """
    In memory representation of Sorted String Table

    List of references to data file ranges with statistics to enable pruning by primary key.
    Declared frozen, so attributes cannot be reassigned after construction
    (the rows list object itself is still a regular mutable list).
    """

    rows: List[SSTableRow]
    """Sorted List of rows by key"""
    min_key: Any
    """Minimum observed primary key across all rows."""
    max_key: Any
    """Maximum observed primary key across all rows."""
59
+
60
+
61
class SSTWriter(Protocol):
    """
    interface for writing SST files

    Input rows MUST be sorted by key_min.

    NOTE(review): earlier wording said rows may be added iteratively via
    "add_rows" batches; this protocol only declares write() - confirm whether
    incremental writes are still intended.
    """

    def write(self, file: OutputFile, rows: List[SSTableRow]) -> None:
        """
        Writes SST file

        Args:
            file: destination to write to.
            rows: rows to persist, sorted by key_min.
        """
        ...
74
+
75
+
76
class SSTReader(Protocol):
    """
    interface for reading SST files
    """

    def read(self, file: InputFile) -> SSTable:
        """Load ``file`` and return its contents as an SSTable."""
        ...