deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/reader/block_scanner.py (new file)
@@ -0,0 +1,378 @@
import heapq
import logging

from collections import defaultdict
from typing import (
    Generator,
    Dict,
    Set,
    Type,
    TypeVar,
    NamedTuple,
    Any,
    List,
    Generic,
    AbstractSet,
)

from deltacat.storage.rivulet.metastore.delta import DeltaContext
from deltacat.storage.rivulet.metastore.sst import SSTableRow
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
    OrderedBlockGroups,
    BlockGroup,
    Block,
)
from deltacat.storage.rivulet.reader.data_reader import RowAndKey, FileReader
from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
from deltacat.storage.rivulet.reader.pyarrow_data_reader import ArrowDataReader
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
from deltacat.storage.rivulet import Schema
from deltacat import logs

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

FILE_FORMAT = TypeVar("FILE_FORMAT")
MEMORY_FORMAT = TypeVar("MEMORY_FORMAT")


class FileReaderWithContext(NamedTuple):
    reader: FileReader[FILE_FORMAT]
    context: DeltaContext


class ZipperMergeHeapRecord(NamedTuple):
    """
    Named tuple for the data structure we're putting into the heap during zipper merge.

    Note we override the comparison operators to use the key
    so that we can add these items to a heap ordered by key.
    """

    key: Any
    data: FILE_FORMAT
    reader: FileReaderWithContext

    def __lt__(self, other):
        return self.key < other.key

    def __le__(self, other):
        return self.key <= other.key

    def __gt__(self, other):
        return self.key > other.key

    def __ge__(self, other):
        return self.key >= other.key


class ZipperBlockScanExecutor(Generic[MEMORY_FORMAT]):
    """
    Class for managing a zipper scan across multiple field groups. This class is
    only ever called inside the higher-level BlockScanner class.

    It is factored into a dedicated class because of the complexity and state
    management of zipper merging.
    """

    def __init__(
        self,
        result_schema: Schema,
        deserialize_to: Type[MEMORY_FORMAT],
        ordered_block_groups: OrderedBlockGroups,
        query: QueryExpression[Any],
        metastore: DatasetMetastore,
        file_readers: Dict[str, FileReader],
    ):
        self.result_schema = result_schema
        self.deserialize_to = deserialize_to
        self.ordered_block_groups = ordered_block_groups
        self.query = query
        self.metastore = metastore
        self.file_readers = file_readers
        """
        Keeps track of block file readers that are open, across block group boundaries.
        E.g., if BlockGroup1 has blocks [1,2,3] and BlockGroup2 has blocks [2,3], we will
        start reading blocks [2,3] and need to re-use the open iterator while reading
        BlockGroup2.
        """
        self._open_file_readers: Dict[SSTableRow, FileReaderWithContext] = {}

    def scan(self) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Perform an N-wise zipper across N field groups.
        Within each field group, there is a set of blocks which belong in this
        BlockGroup's key range.

        As a simplified example, we may have:
        FieldGroup1: [BlockA, BlockB]
        FieldGroup2: [BlockC]
        BlockA: keys 1,3,9,10
        BlockB: keys 2,4,5,6,7,8
        BlockC: keys 1-10

        The algorithm to merge these looks like:
        1. Load each block in DataReader to get an iterator over sorted keys
        2. Build a heap of records across blocks across field groups
        3. Pop record(s) from the heap as long as they have equal keys. For up to N records, merge column-wise
        4. Continue until all blocks are read OR the key range in the query is exceeded
        """
        for block_group in self.ordered_block_groups.block_groups:

            logger.debug(f"Starting scan of block group {block_group}")

            # Set of all blocks that need to be read within this block group
            blocks: set[Block] = {
                block
                for block_set in block_group.field_group_to_blocks.values()
                for block in block_set
            }
            # Open all file readers, such that self._open_file_readers has pointers to open readers
            self.__open_file_readers(blocks)
            record_heap: List[ZipperMergeHeapRecord] = []

            # Seed record heap with a record from each iterator
            file_reader_context: FileReaderWithContext
            for block, file_reader_context in self._open_file_readers.items():
                self.__push_next_row_back_to_heap(
                    block_group, file_reader_context, record_heap
                )

            # For each zipper-merged entry from heap traversal, delegate to deserializer
            for zipper_merged in self.__zipper_merge_sorted_records(
                record_heap, block_group
            ):
                records = [z.data for z in self._dedupe_records(zipper_merged)]
                # TODO (multi format support) we need to handle joining across data readers in the future
                # For now, assume all data readers MUST read to Arrow intermediate format
                for result in ArrowDataReader().join_deserialize_records(
                    records, self.deserialize_to, self.result_schema.get_merge_key()
                ):
                    yield result

    def _dedupe_records(
        self, records: List[ZipperMergeHeapRecord]
    ) -> List[ZipperMergeHeapRecord]:
        """Deduplicate records with the same key (as a sorted list of records).

        Deduplication chooses records based on the following rules of precedence:

        1. Levels with lower numbers take precedence over levels with higher numbers (L0 is preferred over L1)
        2. Newer stream positions take precedence over older stream positions

        Undefined behavior:

        - Duplicate records within files from the same manifest (either in the same or across data files)

        TODO: allow for the definition of a 'dedupe' column to break ties.
        """
        sort_criteria = lambda x: (
            -x.reader.context.level,
            x.reader.context.stream_position,
        )

        grouped_by_sort_group: defaultdict[
            Schema, List[ZipperMergeHeapRecord]
        ] = defaultdict(list)
        for record in records:
            grouped_by_sort_group[record.reader.context.schema].append(record)
        deduped = [
            max(group, key=sort_criteria) for group in grouped_by_sort_group.values()
        ]
        # Sort one last time across schemas (in case there are overlapping fields)
        deduped.sort(key=sort_criteria)
        return deduped

    def __zipper_merge_sorted_records(
        self, record_heap: List[ZipperMergeHeapRecord], block_group: BlockGroup
    ) -> Generator[List[ZipperMergeHeapRecord], None, None]:
        """
        Continually pop from the heap until it is empty OR the block range is exceeded.
        Generate a "zipper merge" of records.

        Algorithm is:
        (1) Pop lowest element from heap. Includes pointer to the iterator it came from.
            Push next largest element from that generator back onto heap
        (2) Buffer records of same key and peek/pop the heap as long as there is a key match.
            For any record popped, push next largest element from generator back onto heap
        (3) Yield merged record by invoking Data Reader

        This solution maintains the following invariants:
        (1) the heap will have at most N records, where N=total blocks in BlockGroup
        (2) the heap has the N smallest records globally
        (3) any data that needs to be merged for a given key exists in the heap

        :param record_heap: seeded heap of ZipperMergeHeapRecords.
        :param block_group: block group being traversed
        :return: generator of merged records. Note this is a list, not a set, to not require hash support
        """
        if not record_heap:
            return

        # Keep iterating until heap is empty or key range is exceeded
        while record_heap:
            curr_heap_record = heapq.heappop(record_heap)
            curr_pk = curr_heap_record.key

            if not self.query.matches_query(curr_pk):
                continue

            # Sanity check - assert that the key we are looking at is in the block group's range
            if not block_group.key_in_range(curr_pk):
                raise RuntimeError(
                    f"Did not expect to find key {curr_pk} on zipper merge heap "
                    f"for block group {block_group}"
                )

            # Find all records to be merged by continuing to pop heap
            merged_by_pk = [curr_heap_record]
            # For the current record itself - push next row back to heap
            self.__push_next_row_back_to_heap(
                block_group, curr_heap_record.reader, record_heap
            )
            # For the rest of the heap elements - peek/pop as long as they equal key
            # Note that heap[0] is equivalent to a peek operation
            while record_heap and record_heap[0][0] == curr_pk:
                merge_heap_record: ZipperMergeHeapRecord = heapq.heappop(record_heap)
                merged_by_pk.append(merge_heap_record)
                self.__push_next_row_back_to_heap(
                    block_group, merge_heap_record.reader, record_heap
                )
            yield merged_by_pk

    def __push_next_row_back_to_heap(
        self,
        block_group: BlockGroup,
        row_context: FileReaderWithContext,
        record_heap: List[ZipperMergeHeapRecord],
    ):
        """
        This is a helper function for __zipper_merge_sorted_records and for scan().

        Given a file reader, it will next() records until it finds the next record within the block group
        and current query. It then pushes that record onto the heap.

        Sometimes we end up needing to seek into the middle of a block because the key range of a query starts
        in the middle of the block. For example, if the block has key range [0,100],
        and the query is for keys [50,100], we need to seek to the first key in the block that is >= 50.

        TODO better support for seeking within a block (rather than O(N) iteration)
        """
        file_reader = row_context.reader
        while file_reader.peek() is not None and (
            block_group.key_below_range(file_reader.peek().key)
            or self.query.below_query_range(file_reader.peek().key)
        ):
            try:
                # call next() on file reader to throw out key which is below range of block group
                next(file_reader)
            except StopIteration:
                # If we have exhausted the iterator, this just means no keys from this block actually match the query
                file_reader.close()
                # TODO how to remove file reader from _open_file_readers?

        if (
            file_reader.peek()
            and self.query.matches_query(file_reader.peek().key)
            and block_group.key_in_range(file_reader.peek().key)
        ):
            try:
                r: RowAndKey = next(file_reader)
                heapq.heappush(
                    record_heap,
                    ZipperMergeHeapRecord(r.key, r.row, row_context),
                )
            except StopIteration:
                # This means we have exhausted the open FileReader and should close it
                file_reader.__exit__()
                # TODO how to remove file reader from _open_file_readers?

    def __open_file_readers(self, blocks: AbstractSet[Block]):
        """
        This method should be called once per block group.
        It opens iterators across all blocks in the block group and stores them in a map.
        Blocks may already be open, if they were also in previous block groups.
        """
        for block in blocks:
            sst_row: SSTableRow = block.row
            if sst_row not in self._open_file_readers:
                file_reader = FileReaderRegistrar.construct_reader_instance(
                    sst_row,
                    self.metastore.file_provider,
                    self.result_schema.get_merge_key(),
                    self.result_schema,
                    self.file_readers,
                )
                file_reader.__enter__()
                # TODO we need some way to compare the blocks. using serialized timestamp as proxy for now
                context = FileReaderWithContext(file_reader, block.context)
                self._open_file_readers[sst_row] = context


class BlockScanner:
    """
    BlockScanner is a low-level internal class which performs IO on Block Groups.

    Note that we expect a block scanner to be initialized PER QUERY because it will keep state about ongoing execution,
    e.g. open iterators across block groups.

    TODO efficiency improvements like parallelizing scanning.
    TODO handle "partial schema" use case, in which the query schema is a subset of the full schema
    TODO in the future we will probably want to cache blocks read across queries
    """

    def __init__(self, metastore: DatasetMetastore):
        # Persist initialized file readers
        self.metastore = metastore
        self.file_readers: Dict[str, FileReader] = {}

    def scan(
        self,
        schema: Schema,
        deserialize_to: Type[MEMORY_FORMAT],
        blocks: Set[SSTableRow],
        query: QueryExpression[Any](),
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Scan records given a query and deserialize to the desired in-memory output format.
        The set of blocks can all be scanned and returned independently.
        TODO handle "partial schema" use case, in which the query schema is a subset of the full schema
        TODO parallelize scan with async io
        """
        data_reader = ArrowDataReader()
        for block in blocks:
            file_reader = FileReaderRegistrar.construct_reader_instance(
                block,
                self.metastore.file_provider,
                schema.get_merge_key(),
                schema,
                self.file_readers,
            )
            with file_reader:
                for generated_records in file_reader.__iter__():
                    # Check whether row matches key in query before deserializing
                    if query.key_range:
                        start, end = query.key_range
                        if generated_records.key < start or generated_records.key > end:
                            continue

                    # Otherwise, key predicate matched; yield deserialized row
                    for deserialized_row in data_reader.deserialize_records(
                        generated_records, deserialize_to
                    ):
                        yield deserialized_row

    def scan_with_zipper(
        self,
        schema: Schema,
        deserialize_to: Type[MEMORY_FORMAT],
        ordered_block_groups: OrderedBlockGroups,
        query: QueryExpression[Any](),
    ) -> Generator[MEMORY_FORMAT, None, None]:
        zipper_scan_executor = ZipperBlockScanExecutor(
            schema,
            deserialize_to,
            ordered_block_groups,
            query,
            self.metastore,
            self.file_readers,
        )
        return zipper_scan_executor.scan()
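The scan() and __zipper_merge_sorted_records docstrings above describe a heap-based k-way ("zipper") merge over sorted block iterators, grouping records that share a key. The following minimal, self-contained sketch illustrates that idea with plain Python lists standing in for open block readers; it is illustrative only, not deltacat code, and the run/record names are invented for the example.

import heapq
from itertools import count
from typing import Any, Dict, Iterator, List, Tuple


def zipper_merge(sorted_runs: List[List[Tuple[Any, Dict]]]) -> Iterator[List[Dict]]:
    """Yield groups of records that share a key, in global key order.

    Each run is a list of (key, record) pairs already sorted by key, standing in
    for one open block iterator in the executor above.
    """
    iterators = [iter(run) for run in sorted_runs]
    tie = count()  # tiebreaker so equal keys never compare the record payloads
    heap: List[Tuple[Any, int, int, Dict]] = []

    def push_next(i: int) -> None:
        # Push the next (key, tiebreaker, run_index, record) from run i, if any remain.
        try:
            key, record = next(iterators[i])
            heapq.heappush(heap, (key, next(tie), i, record))
        except StopIteration:
            pass

    # Seed the heap with one record per run, so it never holds more than N entries.
    for i in range(len(iterators)):
        push_next(i)

    while heap:
        key, _, i, record = heapq.heappop(heap)
        push_next(i)  # replace the popped entry with the next record from the same run
        merged = [record]
        # Keep popping while the smallest remaining key matches the current key.
        while heap and heap[0][0] == key:
            _, _, j, other = heapq.heappop(heap)
            push_next(j)
            merged.append(other)
        yield merged


# Two "field groups" holding different columns for overlapping keys:
run_a = [(1, {"id": 1, "name": "a"}), (3, {"id": 3, "name": "c"})]
run_b = [(1, {"id": 1, "score": 10}), (2, {"id": 2, "score": 20})]
for group in zipper_merge([run_a, run_b]):
    print(group)
# [{'id': 1, 'name': 'a'}, {'id': 1, 'score': 10}]
# [{'id': 2, 'score': 20}]
# [{'id': 3, 'name': 'c'}]

The real executor adds two concerns the sketch omits: records below the block group's or query's key range are skipped before being pushed, and records with equal keys are deduplicated by level and stream position before deserialization.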
deltacat/storage/rivulet/reader/data_reader.py (new file)
@@ -0,0 +1,136 @@
import typing
from abc import abstractmethod
from dataclasses import dataclass
from typing import (
    Protocol,
    Generator,
    Any,
    TypeVar,
    Type,
    Generic,
    List,
    Iterator,
    Optional,
)

from deltacat.storage.rivulet.fs.file_provider import FileProvider
from deltacat.storage.rivulet.metastore.sst import SSTableRow
from deltacat.storage.rivulet.schema.schema import Schema

FILE_FORMAT = TypeVar("FILE_FORMAT")
MEMORY_FORMAT = TypeVar("MEMORY_FORMAT")

T = TypeVar("T")


@dataclass
class RowAndKey(Generic[FILE_FORMAT]):
    """
    Dataclass for a record batch with an index into a specific row.
    Note that record batches store data by column, so the row index should be
    used to index into each column array.
    """

    row: FILE_FORMAT
    key: Any


class FileReader(
    Protocol[FILE_FORMAT],
    Iterator[RowAndKey[FILE_FORMAT]],
    typing.ContextManager,
):
    """
    Interface for reading a specific file.

    TODO (IO abstraction) we will need to think about how various IO interfaces (S3, filesystem, memory)
    plug into this.
    """

    @abstractmethod
    def __init__(
        self,
        sst_row: SSTableRow,
        file_provider: FileProvider,
        primary_key: str,
        schema: Schema,
    ) -> None:
        """
        Required constructor (see: FileReaderRegistrar)

        :param sst_row: SSTableRow containing file metadata
        :param file_provider: Object providing file access
        """
        ...

    @abstractmethod
    def peek(self) -> Optional[RowAndKey[FILE_FORMAT]]:
        """
        Peek at the next RowAndKey without advancing the iterator
        :return: Optional of RowAndKey
        """
        ...

    @abstractmethod
    def __next__(self) -> RowAndKey[FILE_FORMAT]:
        """
        Fetch the next RowAndKey and advance the iterator
        """
        ...

    @abstractmethod
    def close(self):
        """
        Explicit close so that resources can be cleaned up outside the ContextManager.

        We expect that callers opening the reader can EITHER use a with statement or call __enter__().
        Callers closing the reader can EITHER explicitly call close() or have the with statement manage calling __exit__.
        """
        ...


class DataReader(Protocol[FILE_FORMAT]):
    """
    Interface for reading specific file formats.
    A DatasetReader uses a different DataReader for each format.

    TODO (IO abstraction) we will need to think about how various IO interfaces (S3, filesystem, memory)
    plug into this.
    """

    @abstractmethod
    def deserialize_records(
        self, records: FILE_FORMAT, output_type: Type[MEMORY_FORMAT]
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Deserialize records into the specified format.

        Note that output_type gets set based on what a DataScan converts results to,
        e.g. to_arrow, to_dict

        :param records: Input data (generated by the generate_records method)
        :param output_type: Type to deserialize into
        :returns: A generator yielding records of the specified type.
        """
        ...

    @abstractmethod
    def join_deserialize_records(
        self,
        records: List[FILE_FORMAT],
        output_type: Type[MEMORY_FORMAT],
        join_key: str,
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Deserialize and join records into the specified format.

        Note that output_type gets set based on what a DataScan converts results to,
        e.g. to_arrow, to_dict

        :param records: Multiple records which should be merged into the final output record.
            Note this is a list instead of a set to not enforce hashability
        :param join_key: name of field to join across records. This field must be present on all records
        :param output_type: Type to deserialize into
        :returns: A generator yielding records of the specified type.
        """
        ...
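The FileReader protocol above combines Iterator and ContextManager semantics with an explicit peek() and close(), which is exactly the shape ZipperBlockScanExecutor consumes (peek to decide, next() to advance). The hypothetical in-memory reader below sketches that shape in isolation; it is not one of the registered deltacat readers, and the names are invented for the example.

from typing import Any, List, Optional, Tuple


class InMemoryReader:
    """Hypothetical reader over a pre-sorted list of (key, row) pairs."""

    def __init__(self, rows: List[Tuple[Any, dict]]):
        self._rows = rows
        self._pos = 0

    def peek(self) -> Optional[Tuple[Any, dict]]:
        # Return the next (key, row) without advancing, or None when exhausted.
        if self._pos < len(self._rows):
            return self._rows[self._pos]
        return None

    def __next__(self) -> Tuple[Any, dict]:
        if self._pos >= len(self._rows):
            raise StopIteration
        row = self._rows[self._pos]
        self._pos += 1
        return row

    def __iter__(self):
        return self

    def close(self) -> None:
        # Nothing to release for an in-memory reader; real readers close file handles.
        self._rows = []

    def __enter__(self):
        return self

    def __exit__(self, *exc) -> None:
        self.close()


# Usage mirrors the executor: peek to decide whether to consume, next() to advance.
with InMemoryReader([(1, {"v": "a"}), (2, {"v": "b"})]) as reader:
    while reader.peek() is not None:
        key, row = next(reader)
        print(key, row)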
deltacat/storage/rivulet/reader/data_scan.py (new file)
@@ -0,0 +1,63 @@
from typing import Generator, Dict, Optional

import pyarrow as pa

from deltacat.storage.model.shard import Shard
from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
from deltacat.storage.rivulet import Schema


class DataScan:
    """
    Top-level class representing and executing a data scan, on both riv-internal and external data.
    This class is lazy, and executes when the user calls a "to_{format}" method
    to deserialize data into the chosen in-memory format.

    Dataset.py scan() is the entrypoint to create and return a data scan. The user
    then has to chain a "to_{format}" method to read rows in their chosen in-memory format.

    Rivulet cannot simply return file URIs and allow the query engine to process files,
    because rivulet will internally manage details like indexes, custom file formats for bulk records,
    and where data is physically laid out across row groups.

    DataScan allows query engines to send push-down predicates. Push-down predicates are used to
    filter on dimensions natively indexed by riv (e.g. primary key).

    DataScan is coupled to internals of the rivulet format. If the rivulet format evolves, DataScan
    execution should be able to understand which rivulet spec version is used and remain compatible
    with any valid rivulet dataset.

    FUTURE IMPROVEMENTS
    1. Implement full spec for push-down predicates
    2. Figure out how permissions/credential providers work.
    3. Figure out how extension libraries can plug in to_x deserialization support.
       One potential option is to override __getattr__ and check a static class-level Registry
       of to_x methods. Modules would have to import DataScan and call DataScan.register_deserializer(...)
    """

    def __init__(
        self,
        dataset_schema: Schema,
        query: QueryExpression,
        dataset_reader: DatasetReader,
        shard: Optional[Shard],
    ):
        self.dataset_schema = dataset_schema
        self.query = query
        self.dataset_reader = dataset_reader
        self.shard = shard

    def to_arrow(self) -> Generator[pa.RecordBatch, None, None]:
        """
        Generates scan results as Arrow record batches.

        TODO how to make the .to_x methods pluggable?
        """
        return self.dataset_reader.scan(
            self.dataset_schema, pa.RecordBatch, self.query, shard=self.shard
        )

    def to_pydict(self) -> Generator[Dict, None, None]:
        """
        Generates scan results as a Dict for each row.
        """
        return self.dataset_reader.scan(
            self.dataset_schema, Dict, self.query, shard=self.shard
        )
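As the DataScan docstring notes, to_{format} calls are lazy generators: nothing is read until the caller iterates, and the scan object simply carries the schema, query, and reader until then. The stand-in below sketches that chaining pattern over an in-memory list of rows; TinyScan and its fields are hypothetical and only mimic the to_arrow()/to_pydict() surface, with the real entrypoint being Dataset.scan().

from typing import Dict, Generator, Iterable, List

import pyarrow as pa


class TinyScan:
    """Hypothetical stand-in for DataScan over an in-memory list of row dicts."""

    def __init__(self, rows: Iterable[Dict]):
        self._rows = list(rows)

    def to_pydict(self) -> Generator[Dict, None, None]:
        # One dict per row, yielded lazily.
        for row in self._rows:
            yield row

    def to_arrow(self, batch_size: int = 2) -> Generator[pa.RecordBatch, None, None]:
        # Rows converted to Arrow record batches of at most batch_size rows.
        batch: List[Dict] = []
        for row in self._rows:
            batch.append(row)
            if len(batch) == batch_size:
                yield pa.RecordBatch.from_pylist(batch)
                batch = []
        if batch:
            yield pa.RecordBatch.from_pylist(batch)


scan = TinyScan([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}, {"id": 3, "name": "c"}])
for record_batch in scan.to_arrow():
    print(record_batch.num_rows)  # 2, then 1
for row in scan.to_pydict():
    print(row)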