deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/dataset_executor.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from typing import List, Callable, Any
+
+from deltacat.storage.rivulet.field_group import FieldGroup
+from deltacat.storage.rivulet.mvp.Table import MvpTable
+from deltacat.storage.rivulet import Schema
+from deltacat.storage.rivulet.reader.data_scan import DataScan
+from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
+from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+
+
+class DatasetExecutor:
+    """
+    Executor class which runs operations such as select, map, take, save
+
+    This class may store intermediate state while it is executing.
+
+    LogicalPlan is responsible for constructing an executor and ordering operations appropriately
+    """
+
+    def __init__(
+        self,
+        field_groups: List[FieldGroup],
+        schema: Schema,
+        metastore: DatasetMetastore,
+    ):
+        self.effective_schema: Schema = schema.__deepcopy__()
+        self.field_groups = field_groups
+        self.output: MvpTable | None = None
+        self._metastore = metastore
+
+    def collect(self) -> MvpTable:
+        if not self.output:
+            self.output = self._read(self.effective_schema)
+        return self.output
+
+    def select(self, fields: List[str]) -> "DatasetExecutor":
+        """
+        Reads data and selects a subset of fields
+        Note that this implementation is super inefficient (does not push down filters to read, copies data to new MvpTable). That is OK since this will all be replaced
+        """
+        # Read data from original input sources if not already read
+        if not self.output:
+            self.output = self._read(self.effective_schema)
+        # Calculate effective schema and apply it to data
+        self.effective_schema.filter(fields)
+        self.output = MvpTable(
+            {
+                key: value
+                for key, value in self.output.data.items()
+                if key in self.effective_schema
+            },
+        )
+        return self
+
+    def map(self, transform: Callable[[Any], Any]) -> "DatasetExecutor":
+        raise NotImplementedError
+
+    def _read(self, schema: Schema) -> MvpTable:
+        """
+        Internal helper method to read data
+
+        TODO for now this is a dumb in-memory implementation; later this is going to be replaced by a rust library
+        """
+        if len(self.field_groups) == 1:
+            return self._read_as_mvp_table(schema, self.field_groups[0])
+        else:
+            ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
+            ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
+            merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
+            for i in range(2, len(self.field_groups)):
+                ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
+                merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
+            return merged
+
+    def _read_as_mvp_table(self, schema: Schema, field_group: FieldGroup):
+        data = list(
+            DataScan(
+                schema, QueryExpression(), DatasetReader(self._metastore)
+            ).to_pydict()
+        )
+        output = {}
+        for key in schema.fields.keys():
+            output[key] = [d.get(key) for d in data]
+        return MvpTable(output)
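
For orientation, a minimal usage sketch of the executor above. The `field_groups`, `schema`, and `metastore` objects are assumed to be produced by the surrounding rivulet Dataset/LogicalPlan machinery, and the field names are placeholders:

# Hypothetical wiring: field_groups, schema, and metastore come from the
# Dataset/LogicalPlan layer; "id" and "name" are placeholder field names.
executor = DatasetExecutor(
    field_groups=field_groups,
    schema=schema,
    metastore=metastore,
)
# Narrow the effective schema to two fields, then materialize an MvpTable.
result = executor.select(["id", "name"]).collect()
print(result.data.keys())  # only the selected fields remain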
deltacat/storage/rivulet/feather/__init__.py
@@ -0,0 +1,5 @@
+# TODO later on this will be moved to a dedicated package
+from deltacat.storage.rivulet.feather.file_reader import FeatherFileReader
+from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
+
+FileReaderRegistrar.register_reader("feather", FeatherFileReader)
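
Registering the reader at import time is what lets the scan path resolve files with the "feather" extension. A hypothetical custom format could hook in the same way; the reader class below is purely illustrative and simply reuses the feather implementation:

# Hypothetical sketch: any FileReader implementation can be registered for an
# additional extension using the same call as above.
class IpcFileReader(FeatherFileReader):
    pass

FileReaderRegistrar.register_reader("ipc", IpcFileReader)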
deltacat/storage/rivulet/feather/file_reader.py
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+from typing import Optional
+
+import pyarrow.ipc
+from pyarrow import RecordBatch, RecordBatchFileReader
+
+from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.storage.rivulet.reader.data_reader import (
+    RowAndKey,
+    FileReader,
+    FILE_FORMAT,
+)
+from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
+from deltacat.storage.rivulet.schema.schema import Schema
+
+
+class FeatherFileReader(FileReader[RecordBatchRowIndex]):
+    """
+    Feather file reader. This class is not thread safe
+
+    This is mostly a copy-pasta from ParquetFileReader
+    TODO can consider abstracting code between this and ParquetFileReader
+    """
+
+    def __init__(
+        self,
+        sst_row: SSTableRow,
+        file_provider: FileProvider,
+        primary_key: str,
+        schema: Schema,
+    ):
+        self.sst_row = sst_row
+        self.input = file_provider.provide_input_file(sst_row.uri)
+
+        self.key = primary_key
+        self.feather_file = sst_row.uri
+
+        self.schema = schema
+
+        # Iterator from pyarrow iter_batches API call. Pyarrow manages state of traversal within parquet row groups
+
+        """
+        These variables keep state about where the iterator currently is. They are initialized in __enter__()
+        """
+        self._curr_batch: RecordBatch | None = None
+        self._feather_reader: RecordBatchFileReader | None = None
+        # Arrow only lets you read feather files chunk by chunk using
+        # RecordBatchFileReader.get_batch(index)
+        self._curr_batch_index = 0
+        self._curr_row_offset = 0
+        self._pk_col = None
+
+    def peek(self) -> Optional[RowAndKey[FILE_FORMAT]]:
+        """
+        Peek next record
+
+        Note that there is an edge case where peek() is called on the boundary between record batches
+        This only happens when curr_row_offset == curr_batch.num_rows, meaning next() or peek() would need to advance
+        to the next record batch. When this happens, peek() increments _curr_batch and sets _curr_row_offset to 0
+
+        :return: Optional of RowAndPrimaryKey
+        """
+        if not self.__is_initialized():
+            raise RuntimeError(
+                "FeatherFileReader must be initialized with __enter__ before reading"
+            )
+
+        if self.__need_to_advance_record_batch():
+            try:
+                self.__advance_record_batch()
+            except StopIteration:
+                return None
+
+        pk = self._pk_col[self._curr_row_offset].as_py()
+        return RowAndKey(
+            RecordBatchRowIndex(self._curr_batch, self._curr_row_offset), pk
+        )
+
+    def __next__(self) -> RowAndKey[FILE_FORMAT]:
+        if not self.__is_initialized():
+            raise RuntimeError(
+                "FeatherFileReader must be initialized with __enter__ before reading"
+            )
+
+        if self.__need_to_advance_record_batch():
+            self.__advance_record_batch()
+            pk = self._pk_col[0].as_py()
+            return RowAndKey(RecordBatchRowIndex(self._curr_batch, 0), pk)
+        else:
+            pk = self._pk_col[self._curr_row_offset].as_py()
+            offset = self._curr_row_offset
+            self._curr_row_offset += 1
+            return RowAndKey(RecordBatchRowIndex(self._curr_batch, offset), pk)
+
+    def __enter__(self):
+        with self.input.open() as f:
+            self._feather_reader = pyarrow.ipc.RecordBatchFileReader(f)
+            self.__advance_record_batch()
+
+    def __exit__(self, __exc_type, __exc_value, __traceback):
+        self.close()
+        # return False to propagate up error messages
+        return False
+
+    def close(self):
+        # no op
+        return
+
+    def __is_initialized(self):
+        return self._curr_batch and self._pk_col
+
+    def __need_to_advance_record_batch(self):
+        return not self._curr_row_offset < self._curr_batch.num_rows
+
+    def __advance_record_batch(self):
+        """
+        Advance to next record batch
+        :raise StopIteration: If there are no more record batches
+        """
+        try:
+            self._curr_batch = self._feather_reader.get_batch(self._curr_batch_index)
+            self._curr_batch_index += 1
+            self._curr_row_offset = 0
+            self._pk_col = self._curr_batch[self.key]
+            # Filter the batch to only include fields in the schema
+            # Pyarrow select will throw a ValueError if the field is not in the schema
+            fields = [
+                field
+                for field in self.schema.keys()
+                if field in self._curr_batch.schema.names
+            ]
+            self._curr_batch = self._curr_batch.select(fields)
+        except ValueError:
+            raise StopIteration(f"Ended iteration at batch {self._curr_batch_index}")
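
A sketch of how the reader above is driven through its context-manager and iterator protocol. The `sst_row`, `file_provider`, and `schema` values are assumed to come from the SST/metastore layer shown elsewhere in this diff, and "id" is a placeholder primary key:

# Hypothetical driver code; only peek(), __next__(), and the context-manager
# protocol defined above are used.
reader = FeatherFileReader(sst_row, file_provider, primary_key="id", schema=schema)
with reader:  # __enter__ opens the input file and loads the first record batch
    while reader.peek() is not None:  # peek() returns None once the file is exhausted
        row_and_key = next(reader)  # RowAndKey entries in file order
        ...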
deltacat/storage/rivulet/feather/serializer.py
@@ -0,0 +1,35 @@
+from typing import List
+
+import pyarrow as pa
+from pyarrow import feather
+
+from deltacat.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.storage.rivulet import Schema
+from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
+from deltacat.storage.rivulet.fs.file_provider import FileProvider
+
+
+class FeatherDataSerializer(ArrowSerializer):
+    """
+    Feather data writer. Responsible for flushing rows to feather files and returning SSTable rows for any file(s) written
+
+    TODO Support recording byte range offsets. Deferring this for now
+    We may need to provide a wrapper class over fsspec which introspects how many bytes are written
+    when .write is called on the output stream
+    """
+
+    def __init__(self, file_provider: FileProvider, schema: Schema):
+        super().__init__(file_provider, schema)
+
+    def serialize(self, table: pa.Table) -> List[SSTableRow]:
+        file = self.file_provider.provide_data_file("feather")
+
+        with file.create() as outfile:
+            # Note that write_feather says that dest is a string, but it is really any object implementing write()
+            feather.write_feather(table, dest=outfile)
+
+        # Because we only write one row group, it only creates one SSTableRow
+        # we may have more granular SST indexes for other formats
+        key_min, key_max = self._get_min_max_key(table)
+        # TODO need to populate byte offsets. For now, we are writing single files per SSTableRow
+        return [SSTableRow(key_min, key_max, file.location, 0, 0)]
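
A matching write-side sketch: given a FileProvider and rivulet Schema (assumed to exist, with a primary key matching the placeholder columns), the serializer flushes a pyarrow table to a single feather file and returns the SST index rows describing it:

import pyarrow as pa

# Hypothetical inputs: file_provider and schema come from the dataset write path.
serializer = FeatherDataSerializer(file_provider, schema)
table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})
sst_rows = serializer.serialize(table)  # one SSTableRow per feather file written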
deltacat/storage/rivulet/fs/__init__.py (file without changes)
deltacat/storage/rivulet/fs/file_provider.py
@@ -0,0 +1,105 @@
+import posixpath
+import time
+from typing import List, Generator
+
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.storage.rivulet.fs.file_store import FileStore
+from deltacat.storage.rivulet.fs.input_file import InputFile
+from deltacat.storage.rivulet.fs.output_file import OutputFile
+from deltacat.utils.metafile_locator import _find_partition_path
+
+
+class FileProvider:
+    """
+    Manages the generation of URIs for data and metadata files and facilitates the creation of files at those URIs.
+    All files are generated relative to the root of the storage location.
+
+    This class is inspired by the Iceberg `LocationProvider` and provides methods
+    to generate paths for various types of files (e.g., data files, SSTs, and manifests)
+    while maintaining a clear structure within the dataset.
+
+    TODO (deltacat): FileProvider will be replaced/refactored once we are able to integrate with Deltacat.
+    TODO: Incorporate additional file naming conventions, such as including
+        partitionId, taskId, and operationId, to improve traceability and
+        idempotency.
+    """
+
+    uri: str
+
+    def __init__(self, uri: str, locator: PartitionLocator, file_store: FileStore):
+        """
+        Initializes the file provider.
+
+        param: uri: Base URI of the dataset.
+        param: file_store: FileStore instance for creating and reading files.
+        """
+        self.uri = uri
+        self._locator = locator
+        self._file_store = file_store
+
+    def provide_data_file(self, extension: str) -> OutputFile:
+        """
+        Creates a new data file.
+
+        TODO: Ensure storage interface can provide data files.
+
+        param: extension: File extension (e.g., "parquet").
+        returns: OutputFile instance pointing to the created data file.
+        """
+        partition_path = _find_partition_path(self.uri, self._locator)
+        uri = posixpath.join(
+            partition_path, "data", f"{int(time.time_ns())}.{extension}"
+        )
+        return self._file_store.create_output_file(uri)
+
+    def provide_l0_sst_file(self) -> OutputFile:
+        """
+        Creates a new L0 SST file.
+
+        TODO: Ensure storage interface can provide sst files.
+
+        returns: OutputFile instance pointing to the created SST file.
+        """
+        partition_path = _find_partition_path(self.uri, self._locator)
+        uri = posixpath.join(
+            partition_path, "metadata", "ssts", "0", f"{int(time.time_ns())}.json"
+        )
+        return self._file_store.create_output_file(uri)
+
+    def provide_input_file(self, uri: str) -> InputFile:
+        """
+        Reads an existing file.
+
+        param: uri: URI of the file to read.
+        returns: InputFile instance for the specified URI.
+        """
+        return self._file_store.create_input_file(uri)
+
+    def provide_manifest_file(self) -> OutputFile:
+        """
+        Creates a new manifest file.
+
+        returns: OutputFile instance pointing to the created manifest file.
+        """
+        uri = f"{self.uri}/metadata/manifests/{int(time.time_ns())}.json"
+        return self._file_store.create_output_file(uri)
+
+    def get_sst_scan_directories(self) -> List[str]:
+        """
+        Retrieves SST scan directories.
+
+        returns: List of directories containing SSTs.
+        """
+        partition_path = _find_partition_path(self.uri, self._locator)
+        return [f"{partition_path}/metadata/ssts/0/"]
+
+    def generate_sst_uris(self) -> Generator[InputFile, None, None]:
+        """
+        Generates all SST URIs.
+
+        returns: Generator of InputFile instances for SSTs.
+        """
+        sst_directories = self.get_sst_scan_directories()
+        for directory in sst_directories:
+            for file in self._file_store.list_files(directory):
+                yield file
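
A sketch of the layout the provider produces, assuming a FileStore rooted at a local directory and a PartitionLocator for an existing partition (both placeholders here):

# Hypothetical setup: the root URI and partition_locator would normally come
# from the dataset / metastore layer.
file_store = FileStore("/tmp/rivulet-dataset")
provider = FileProvider("/tmp/rivulet-dataset", partition_locator, file_store)

data_file = provider.provide_data_file("feather")  # <partition>/data/<timestamp>.feather
sst_file = provider.provide_l0_sst_file()  # <partition>/metadata/ssts/0/<timestamp>.json
manifest = provider.provide_manifest_file()  # <root>/metadata/manifests/<timestamp>.json

for sst in provider.generate_sst_uris():  # one InputFile per discovered SST
    print(sst.location)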
deltacat/storage/rivulet/fs/file_store.py
@@ -0,0 +1,130 @@
+from typing import Tuple, Iterator, Optional
+from pyarrow.fs import FileSystem, FileType, FileSelector
+
+# TODO(deltacat): Rely on deltacat implementation to resolve path and filesystem.
+from ray.data.datasource.path_util import _resolve_paths_and_filesystem
+
+from deltacat.storage.rivulet.fs.input_file import FSInputFile
+from deltacat.storage.rivulet.fs.output_file import FSOutputFile
+
+
+class FileStore:
+    """
+    Manages the filesystem and low-level file operations.
+    This class is designed to work with any filesystem supported by PyArrow: local, S3, HDFS, GCS,
+    and other fsspec-compatible filesystems.
+
+    TODO: Add better error consolidation between filesystems. Will be handled by deltacat implementation?
+
+    method: `filesystem`: Resolves and normalizes a given path and filesystem.
+    method: `file_exists`: Checks if a file exists at a given URI.
+    method: `create_output_file`: Creates a new file for writing at a specified URI.
+    method: `create_input_file`: Reads an existing file from a specified URI.
+    method: `list_files`: Lists all files within a specified directory URI.
+    """
+
+    def __init__(self, path: str, filesystem: Optional[FileSystem] = None):
+        """
+        Serves as the source of truth for all file operations, ensuring that
+        all paths and operations are relative to the specified filesystem,
+        providing consistency and compatibility across fsspec supported backends.
+
+        TODO (deltacat): maybe rely on deltacat catalog as a source of truth for rivulet filesystem.
+
+        param: path (str): The base URI or path for the filesystem.
+        param: filesystem (FileSystem): A PyArrow filesystem instance.
+        """
+        _, fs = FileStore.filesystem(path, filesystem)
+        self.filesystem = filesystem or fs
+
+    @staticmethod
+    def filesystem(
+        path: str, filesystem: Optional[FileSystem] = None
+    ) -> Tuple[str, FileSystem]:
+        """
+        Resolves and normalizes the given path and filesystem.
+
+        param: path (str): The URI or path to resolve.
+        param: filesystem (Optional[FileSystem]): An optional filesystem instance.
+        returns: Tuple[str, FileSystem]: The normalized path and filesystem.
+        raises: AssertionError: If multiple paths are resolved.
+        """
+        paths, filesystem = _resolve_paths_and_filesystem(
+            paths=path, filesystem=filesystem
+        )
+        assert len(paths) == 1, "Multiple paths not supported"
+        return paths[0], filesystem
+
+    def file_exists(
+        self, data_uri: str, filesystem: Optional[FileSystem] = None
+    ) -> bool:
+        """
+        Checks if a file exists at the specified URI.
+
+        param: data_uri (str): The URI of the file to check.
+        param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
+        returns: bool: True if the file exists, False otherwise.
+        """
+        path, filesystem = FileStore.filesystem(data_uri, filesystem or self.filesystem)
+        return filesystem.get_file_info(path).type != FileType.NotFound
+
+    def create_output_file(
+        self, data_uri: str, filesystem: Optional[FileSystem] = None
+    ) -> FSOutputFile:
+        """
+        Creates a new output file for writing at the specified URI.
+
+        param: data_uri (str): The URI where the file will be created.
+        param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
+        returns: FSOutputFile: An object for writing to the file.
+        raises: IOError: If file creation fails.
+        """
+        try:
+            path, filesystem = FileStore.filesystem(
+                data_uri, filesystem or self.filesystem
+            )
+            return FSOutputFile(path, filesystem)
+        except Exception as e:
+            raise IOError(f"Failed to create file '{data_uri}': {e}")
+
+    def create_input_file(
+        self, data_uri: str, filesystem: Optional[FileSystem] = None
+    ) -> FSInputFile:
+        """
+        Create a new input file for reading at the specified URI.
+
+        param: data_uri (str): The URI of the file to read.
+        param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
+        returns: FSInputFile: An object for reading the file.
+        raises: IOError: If file reading fails.
+        """
+        try:
+            path, filesystem = FileStore.filesystem(
+                data_uri, filesystem or self.filesystem
+            )
+            return FSInputFile(path, filesystem)
+        except Exception as e:
+            raise IOError(f"Failed to read file '{data_uri}': {e}")
+
+    def list_files(
+        self, data_uri: str, filesystem: Optional[FileSystem] = None
+    ) -> Iterator[FSInputFile]:
+        """
+        Lists all files in the specified directory URI.
+
+        param: data_uri (str): The URI of the directory to list files from.
+        param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
+        returns: Iterator[FSInputFile]: An iterator of FSInputFile objects representing the files.
+        raises: IOError: If listing files fails.
+        """
+        try:
+            path, filesystem = FileStore.filesystem(
+                data_uri, filesystem or self.filesystem
+            )
+            file_info = filesystem.get_file_info(FileSelector(path, recursive=False))
+
+            for file in file_info:
+                if file.type == FileType.File:
+                    yield FSInputFile(file.path, filesystem)
+        except Exception as e:
+            raise IOError(f"Failed to list files in '{data_uri}': {e}")
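
The store resolves a URI to a (path, filesystem) pair once and reuses that filesystem for every read, write, and listing. A minimal round-trip sketch against the local filesystem (paths are placeholders):

# Minimal local-filesystem sketch; the paths are placeholders.
store = FileStore("/tmp/rivulet-dataset")

out = store.create_output_file("/tmp/rivulet-dataset/data/example.bin")
with out.create() as stream:
    stream.write(b"hello rivulet")

assert store.file_exists("/tmp/rivulet-dataset/data/example.bin")

for f in store.list_files("/tmp/rivulet-dataset/data"):
    with store.create_input_file(f.location).open() as stream:
        print(f.location, stream.read())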
deltacat/storage/rivulet/fs/input_file.py
@@ -0,0 +1,76 @@
+from contextlib import contextmanager
+import io
+from abc import ABC, abstractmethod
+from typing import Protocol
+
+from pyarrow.fs import FileSystem, FileType
+
+
+class InputStream(Protocol):
+    """A protocol with a subset of IOBase for file-like input objects"""
+
+    @abstractmethod
+    def read(self, size: int = -1) -> bytes:
+        ...
+
+    @abstractmethod
+    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
+        ...
+
+    @abstractmethod
+    def tell(self) -> int:
+        ...
+
+    @abstractmethod
+    def close(self) -> None:
+        ...
+
+    def __enter__(self) -> "InputStream":
+        ...
+
+    @abstractmethod
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        ...
+
+
+class InputFile(ABC):
+    """Abstraction for interacting with input files"""
+
+    def __init__(self, location: str):
+        self._location = location
+
+    @property
+    def location(self) -> str:
+        return self._location
+
+    @abstractmethod
+    def exists(self) -> bool:
+        """Return whether the location exists.
+
+        Raises:
+            PermissionError: If this has insufficient permissions to access the file at location.
+        """
+
+    @abstractmethod
+    def open(self) -> InputStream:
+        """Return a file-like object for input
+
+        Raises:
+            FileNotFoundError: If the file does not exist at self.location.
+            PermissionError: If this has insufficient permissions to access the file at location.
+        """
+
+
+class FSInputFile(InputFile):
+    def __init__(self, location: str, fs: FileSystem):
+        self._location = location
+        self.fs = fs
+
+    def exists(self) -> bool:
+        file_info = self.fs.get_file_info(self._location)
+        return file_info.type != FileType.NotFound
+
+    @contextmanager
+    def open(self, seekable: bool = False):
+        with self.fs.open_input_file(self._location) as input_stream:
+            yield input_stream
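
The concrete FSInputFile can also be used directly with any pyarrow FileSystem; a small sketch with a LocalFileSystem and a placeholder path:

from pyarrow.fs import LocalFileSystem

# Hypothetical path; FSInputFile wraps whichever pyarrow FileSystem it is given.
infile = FSInputFile("/tmp/rivulet-dataset/data/example.bin", LocalFileSystem())
if infile.exists():
    with infile.open() as stream:
        payload = stream.read()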
deltacat/storage/rivulet/fs/output_file.py
@@ -0,0 +1,86 @@
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+import posixpath
+from typing import Protocol
+
+from pyarrow.fs import FileSystem, FileType
+
+from deltacat.storage.rivulet.fs.input_file import FSInputFile, InputFile
+
+
+class OutputStream(Protocol):  # pragma: no cover
+    """A protocol with a subset of IOBase for file-like output objects"""
+
+    @abstractmethod
+    def write(self, b: bytes) -> int:
+        ...
+
+    @abstractmethod
+    def close(self) -> None:
+        ...
+
+    @abstractmethod
+    def __enter__(self) -> "OutputStream":
+        ...
+
+    @abstractmethod
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        ...
+
+
+class OutputFile(ABC):
+    """Abstraction for interacting with output files"""
+
+    def __init__(self, location: str):
+        self._location = location
+
+    @property
+    def location(self) -> str:
+        return self._location
+
+    @abstractmethod
+    def exists(self) -> bool:
+        """Return whether the location exists.
+
+        Raises:
+            PermissionError: If this has insufficient permissions to access the file at location.
+        """
+
+    @abstractmethod
+    def to_input_file(self) -> InputFile:
+        """Return an InputFile for this output file's location"""
+
+    @abstractmethod
+    def create(self) -> OutputStream:
+        """Return a file-like object for output
+
+        TODO: overwrite protection (FileExistsError?)
+        Raises:
+            PermissionError: If this has insufficient permissions to access the file at location.
+        """
+
+
+class FSOutputFile(OutputFile):
+    def __init__(self, location: str, fs: FileSystem):
+        self._location = location
+        self.fs = fs
+
+    def exists(self) -> bool:
+        file_info = self.fs.get_file_info(self._location)
+        return file_info.type != FileType.NotFound
+
+    def to_input_file(self) -> "FSInputFile":
+        return FSInputFile(self._location, self.fs)
+
+    @contextmanager
+    def create(self):
+        """Create and open the file for writing."""
+        try:
+            parent_dir = posixpath.dirname(self._location)
+            if parent_dir:  # Check if there's a parent directory to create
+                self.fs.create_dir(parent_dir, recursive=True)
+
+            with self.fs.open_output_stream(self._location) as output_stream:
+                yield output_stream
+        except Exception as e:
+            raise IOError(f"Failed to create or write to file '{self._location}': {e}")