deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +19 -15
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +18 -8
- deltacat/catalog/model/catalog.py +111 -73
- deltacat/catalog/model/properties.py +25 -22
- deltacat/compute/jobs/client.py +7 -5
- deltacat/constants.py +1 -2
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/model/shard.py +6 -2
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/types/media.py +3 -3
- deltacat/utils/daft.py +530 -4
- deltacat/utils/export.py +3 -1
- deltacat/utils/url.py +1 -1
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → examples/experimental}/__init__.py +0 -0
- /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
- /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
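Most of this release is a package reorganization: the Iceberg and Rivulet modules move under `deltacat.experimental`. A minimal sketch of how downstream imports change (new module paths are taken from the file list above; the old paths shown in comments are the removed ones, and any surrounding usage is hypothetical):

```python
# Old import paths (2.0.0b10) -- now removed:
# from deltacat.storage.rivulet import Dataset, Schema
# from deltacat.catalog.iceberg import IcebergCatalogConfig

# New import paths (2.0.0b11):
from deltacat.experimental.storage.rivulet import Dataset, Schema, Field, Datatype
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
from deltacat.experimental.storage.iceberg.model import SchemaMapper
```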
deltacat/compute/jobs/client.py
CHANGED
@@ -70,6 +70,7 @@ def _get_head_node_ip(cluster_cfg: str) -> str:
         check=True,
     )
     # the head node IP should be the last line printed to stdout
+    # TODO(pdames): add IPv6 support
     head_node_ip = proc.stdout.splitlines()[-1]
     if not re.match(
         r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
@@ -322,12 +323,12 @@ class DeltaCatJobClient(JobSubmissionClient):
 
 def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
     """
-    Create a
+    Create a DeltaCAT Job Client that can be used to submit jobs to a local Ray
    cluster. Initializes Ray if it's not already running.
 
     Args:
-        *args: Positional arguments to pass to `
-        **kwargs: Keyword arguments to pass to `
+        *args: Positional arguments to pass to `deltacat.init()`.
+        **kwargs: Keyword arguments to pass to `deltacat.init()`.
     Returns:
         DeltaCatJobClient: A client instance that can be used to submit and
             manage local Ray jobs.
@@ -338,7 +339,7 @@ def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
     if not dc.is_initialized():
         context = dc.init(*args, **kwargs)
     else:
-        context = dc.init(
+        context = dc.init()
     if context.dashboard_url:
         head_node_ip, port = context.dashboard_url.split(":")
     else:
@@ -367,7 +368,8 @@ def job_client(
     port: Union[str, int] = "8265",
 ) -> DeltaCatJobClient:
     """
-    Create a DeltaCAT Job Client that can be used to submit jobs to a remote
+    Create a DeltaCAT Job Client that can be used to submit jobs to a remote
+    Ray cluster.
 
     Args:
         cluster_cfg_file_path: Path to the Ray Cluster Launcher
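The `local_job_client()` docstring fixes above clarify that its arguments are forwarded to `deltacat.init()`. A minimal usage sketch under that assumption; the submitted entrypoint is a hypothetical placeholder:

```python
from deltacat.compute.jobs.client import local_job_client

# Arguments are forwarded to deltacat.init(); Ray is started locally if needed.
client = local_job_client()

# DeltaCatJobClient subclasses Ray's JobSubmissionClient, so the standard Ray
# job submission API is available (run_job() is DeltaCAT's helper, shown in the
# job_runner.py diff further down).
job_id = client.submit_job(entrypoint="python -c \"print('hello from ray')\"")
print(job_id)
```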
deltacat/constants.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 
 from deltacat.utils.common import env_string, env_bool
-import os
 
 # Environment variables
 DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -40,7 +39,7 @@ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
 )
 DELTACAT_ROOT = env_string(
     "DELTACAT_ROOT",
-
+    "",
 )
 
 # CLI Args
deltacat/env.py
CHANGED
@@ -1,3 +1,4 @@
+import argparse
 import os
 import logging
 from typing import Dict, Any
@@ -49,3 +50,12 @@ def create_ray_runtime_environment() -> Dict[str, Any]:
         "env_vars": worker_env_vars,
     }
     return runtime_environment
+
+
+def store_cli_args_in_os_environ(script_args_list=[]):
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args_list:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+    os.environ.update(vars(args))
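The new `store_cli_args_in_os_environ()` helper added above parses script arguments and copies them into `os.environ`. A usage sketch with a hypothetical `--namespace` flag:

```python
import os

from deltacat.env import store_cli_args_in_os_environ

# Each entry is an (args, kwargs) pair forwarded to ArgumentParser.add_argument().
store_cli_args_in_os_environ(
    [
        (["--namespace"], {"default": "my_namespace", "help": "Table namespace"}),
    ]
)

# Parsed values land in os.environ via os.environ.update(vars(args)).
print(os.environ["namespace"])
```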
deltacat/examples/basic_logging.py
CHANGED
@@ -6,9 +6,7 @@ import ray
 
 from deltacat import logs
 from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
-from
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 from deltacat.env import create_ray_runtime_environment
 
 # initialize the driver logger
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py
CHANGED
@@ -9,10 +9,8 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.catalog.iceberg import IcebergCatalogConfig
-from
-    store_cli_args_in_os_environ,
-)
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -23,7 +21,7 @@ from pyiceberg.schema import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import BucketTransform
 
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
 )
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py
CHANGED
@@ -4,9 +4,7 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
 from pyiceberg.table.sorting import SortField, SortOrder
 
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
deltacat/examples/indexer/indexer.py
CHANGED
@@ -59,8 +59,8 @@ def run(
             "use_pyarrow": True,  # use the native pyarrow reader
         },
         # writer arguments to pass to the default writer (polars)
-        # for the given parquet-based datasink, it accepts the same
-        # arguments as polars.DataFrame.
+        # for the given parquet-based datasink, it generally accepts the same
+        # arguments as polars.DataFrame.write_{dest-type} except for `file`
         writer_args={
             "compression": "lz4",  # faster compression & decompression
             # "compression": "zstd",  # better compression ratio
deltacat/examples/indexer/job_runner.py
CHANGED
@@ -64,8 +64,7 @@ def run_sync(
     cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
     client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
     job_number = 0
-    while
-        jobs_to_submit -= 1
+    while job_number < jobs_to_submit:
         job_dest = dest + f".{job_number}"
         job_run_result = client.run_job(
             # Entrypoint shell command to execute
deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py
CHANGED
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
 
     This configuration is passed through to PyIceberg by invoking load_catalog.
     The Properties provided must match properties accepted by PyIceberg for each catalog type
-    See: :func:`deltacat.catalog.iceberg.initialize`
+    See: :func:`deltacat.experimental.catalog.iceberg.initialize`
 
     Attributes:
         type: The PyIceberg Catalog instance
deltacat/{catalog → experimental/catalog}/iceberg/impl.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import sys
 
 from typing import Any, Dict, List, Optional, Union
 
@@ -7,13 +8,19 @@ from daft.daft import ScanOperatorHandle, StorageConfig
 from daft.logical.builder import LogicalPlanBuilder
 
 from deltacat import logs
+from deltacat.catalog.model.catalog import Catalog
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.daft
+from deltacat.utils.daft import DeltaCatScanOperator
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.iceberg_scan_planner import
-
+from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
+    IcebergScanPlanner,
+)
+from deltacat.experimental.storage.iceberg.model import (
+    PartitionSchemeMapper,
+    SchemaMapper,
+)
 from deltacat.storage.model.partition import PartitionScheme
-from deltacat.storage.iceberg.impl import _get_native_catalog
+from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
 from deltacat.storage.model.sort_key import SortScheme
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -26,20 +33,31 @@ from deltacat.storage.model.types import (
     LocalTable,
     StreamFormat,
 )
-from deltacat.storage.iceberg import impl as IcebergStorage
+from deltacat.experimental.storage.iceberg import impl as IcebergStorage
 from deltacat.types.media import ContentType
 from deltacat.types.tables import TableWriteMode
 from deltacat.constants import DEFAULT_NAMESPACE
-from deltacat.catalog.iceberg.iceberg_catalog_config import
+from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+    IcebergCatalogConfig,
+)
 
-from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
 from pyiceberg.transforms import BucketTransform
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+IcebergCatalog = sys.modules[__name__]
+
+
+def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
+    """
+    Factory method to construct a catalog from Iceberg catalog configuration.
+    """
+    return Catalog(config, impl=IcebergCatalog, *args, **kwargs)
+
 
 # catalog functions
-def initialize(
+def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
     """
     Initializes an Iceberg catalog with the given config.
 
@@ -123,7 +141,7 @@ def write_to_table(
     )
     # TODO(pdames): only append s3:// to output file paths when writing to S3!
     out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
-    from deltacat.catalog.iceberg import overrides
+    from deltacat.experimental.catalog.iceberg import overrides
 
     overrides.append(
         table_definition.table.native_object,
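The new `from_config()` factory above wraps this Iceberg impl module in a generic DeltaCAT `Catalog`. A sketch of calling it; the `IcebergCatalogConfig` field names and values below are assumptions and must match properties accepted by PyIceberg's `load_catalog`:

```python
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
    IcebergCatalogConfig,
)

# Assumed config fields: a PyIceberg catalog type plus its load_catalog properties.
config = IcebergCatalogConfig(
    type="sql",  # assumed value; see the IcebergCatalogConfig docstring
    properties={"uri": "sqlite:///:memory:", "warehouse": "file:///tmp/warehouse"},
)

# Returns a deltacat Catalog whose impl is this module (IcebergCatalog = sys.modules[__name__]).
catalog = IcebergCatalog.from_config(config)
```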
deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py
CHANGED
@@ -5,7 +5,7 @@ from deltacat.storage.model.scan.push_down import Pushdown
 from deltacat.storage.model.scan.scan_plan import ScanPlan
 from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
 from deltacat.storage.util.scan_planner import ScanPlanner
-from deltacat.storage.iceberg.impl import _try_load_iceberg_table
+from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table
 
 
 class IcebergScanPlanner(ScanPlanner):
deltacat/{storage → experimental/storage}/iceberg/impl.py
CHANGED
@@ -32,7 +32,7 @@ from deltacat.storage import (
     NamespaceProperties,
 )
 from deltacat.storage.model.manifest import Manifest
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
deltacat/experimental/storage/rivulet/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
+from deltacat.experimental.storage.rivulet.schema.schema import Field
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+from deltacat.experimental.storage.rivulet.schema.schema import Datatype
+
+__all__ = [
+    "Schema",
+    "Field",
+    "Dataset",
+    "Datatype",
+]
deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py
CHANGED
@@ -2,10 +2,13 @@ from abc import ABC, abstractmethod
 from typing import Iterator, List, Any
 import pyarrow as pa
 
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet import Schema
-from deltacat.storage.rivulet.serializer import
-
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.serializer import (
+    DataSerializer,
+    MEMTABLE_DATA,
+)
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
 
 
 class ArrowSerializer(DataSerializer, ABC):
deltacat/{storage → experimental/storage}/rivulet/dataset.py
CHANGED
@@ -24,19 +24,23 @@ from deltacat.storage.model.shard import Shard, ShardingStrategy
 from deltacat.storage.model.stream import Stream, StreamLocator
 from deltacat.storage.model.transaction import TransactionOperationList
 from deltacat.storage.model.types import CommitState, StreamFormat
-from deltacat.storage.rivulet.fs.file_store import FileStore
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.reader.dataset_metastore import
-
+from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet import Schema, Field
 from deltacat.utils.export import export_dataset
 from .schema.schema import Datatype
 
-from deltacat.storage.rivulet.reader.data_scan import DataScan
-from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
-from deltacat.storage.rivulet.reader.query_expression import
+from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 
-from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
-from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
+from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
+from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
     MemtableDatasetWriter,
 )
 
deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py
CHANGED
@@ -2,13 +2,16 @@ from __future__ import annotations
 
 from typing import List, Callable, Any
 
-from deltacat.storage.rivulet.
-from deltacat.storage.rivulet
-from deltacat.storage.rivulet import
-from deltacat.storage.rivulet.reader.
-
-
-from deltacat.storage.rivulet.reader.
+from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 
 
 class DatasetExecutor:
@@ -22,12 +25,10 @@ class DatasetExecutor:
 
     def __init__(
         self,
-        field_groups: List[FieldGroup],
         schema: Schema,
         metastore: DatasetMetastore,
     ):
         self.effective_schema: Schema = schema.__deepcopy__()
-        self.field_groups = field_groups
         self.output: MvpTable | None = None
         self._metastore = metastore
 
@@ -64,18 +65,9 @@ class DatasetExecutor:
 
         TODO for now this is doing dumb in-memory implementation and later this is going to be replaced by rust library
         """
-
-            return self._read_as_mvp_table(schema, self.field_groups[0])
-        else:
-            ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
-            ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
-            merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
-            for i in range(2, len(self.field_groups)):
-                ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
-                merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
-            return merged
+        return self._read_as_mvp_table(schema)
 
-    def _read_as_mvp_table(self, schema: Schema
+    def _read_as_mvp_table(self, schema: Schema):
         data = list(
             DataScan(
                 schema, QueryExpression(), DatasetReader(self._metastore)
deltacat/experimental/storage/rivulet/feather/__init__.py
ADDED
@@ -0,0 +1,7 @@
+# TODO later on this will be moved to a dedicated package
+from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
+from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+    FileReaderRegistrar,
+)
+
+FileReaderRegistrar.register_reader("feather", FeatherFileReader)
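The new `feather/__init__.py` above (and the matching `parquet/__init__.py` later in this diff) registers its file reader with `FileReaderRegistrar` at import time, so importing the subpackage is enough to make the format readable. A small sketch of relying on that side effect, plus a hypothetical extra registration that reuses the same `register_reader()` call shown in the diff:

```python
# Importing the subpackages runs the register_reader() calls shown above,
# making the "feather" and "parquet" extensions resolvable by the rivulet reader.
import deltacat.experimental.storage.rivulet.feather  # noqa: F401
import deltacat.experimental.storage.rivulet.parquet  # noqa: F401

from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
    FileReaderRegistrar,
)

# Hypothetical: map an additional file extension onto the existing Feather reader.
FileReaderRegistrar.register_reader("arrow", FeatherFileReader)
```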
deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py
CHANGED
@@ -5,15 +5,17 @@ from typing import Optional
 import pyarrow.ipc
 from pyarrow import RecordBatch, RecordBatchFileReader
 
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet.reader.data_reader import (
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet.reader.data_reader import (
     RowAndKey,
     FileReader,
     FILE_FORMAT,
 )
-from deltacat.storage.rivulet.reader.pyarrow_data_reader import
-
+from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+    RecordBatchRowIndex,
+)
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
 
 
 class FeatherFileReader(FileReader[RecordBatchRowIndex]):
deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py
CHANGED
@@ -3,10 +3,10 @@ from typing import List
 import pyarrow as pa
 from pyarrow import feather
 
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet import Schema
-from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
 
 
 class FeatherDataSerializer(ArrowSerializer):
deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py
CHANGED
@@ -3,9 +3,9 @@ import time
 from typing import List, Generator
 
 from deltacat.storage.model.partition import PartitionLocator
-from deltacat.storage.rivulet.fs.file_store import FileStore
-from deltacat.storage.rivulet.fs.input_file import InputFile
-from deltacat.storage.rivulet.fs.output_file import OutputFile
+from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
+from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
 from deltacat.utils.metafile_locator import _find_partition_path
 
 
deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py
CHANGED
@@ -4,8 +4,8 @@ from pyarrow.fs import FileSystem, FileType, FileSelector
 # TODO(deltacat): Rely on deltacat implementation to resolve path and filesystem.
 from ray.data.datasource.path_util import _resolve_paths_and_filesystem
 
-from deltacat.storage.rivulet.fs.input_file import FSInputFile
-from deltacat.storage.rivulet.fs.output_file import FSOutputFile
+from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile
+from deltacat.experimental.storage.rivulet.fs.output_file import FSOutputFile
 
 
 class FileStore:
deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Protocol
 
 from pyarrow.fs import FileSystem, FileType
 
-from deltacat.storage.rivulet.fs.input_file import FSInputFile, InputFile
+from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile, InputFile
 
 
 class OutputStream(Protocol):  # pragma: no cover
deltacat/{storage → experimental/storage}/rivulet/logical_plan.py
CHANGED
@@ -1,9 +1,9 @@
 from dataclasses import dataclass
 from typing import List, Callable, Any, Protocol
 
-from deltacat.storage.rivulet.dataset_executor import DatasetExecutor
-from deltacat.storage.rivulet
-from deltacat.storage.rivulet import
+from deltacat.experimental.storage.rivulet.dataset_executor import DatasetExecutor
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
 
 
 class DatasetOperation(Protocol):
@@ -99,7 +99,7 @@ class LogicalPlan:
         self.operations.append(CollectOperation())
         return self
 
-    def execute(self, executor: DatasetExecutor) ->
+    def execute(self, executor: DatasetExecutor) -> MvpTable:
         for operation in self.operations:
             operation.visit(executor)
         return executor.output
deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py
CHANGED
@@ -19,7 +19,7 @@ from deltacat.storage.model.partition import PartitionLocator
 from deltacat.storage.model.transaction import TransactionOperationList
 
 from deltacat.storage.model.types import StreamFormat
-from deltacat.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet import Schema
 
 StreamPosition = int
 """The stream position for creating a consistent ordering of manifests."""
deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py
CHANGED
@@ -4,9 +4,9 @@ import json
 from itertools import zip_longest
 from typing import List
 
-from deltacat.storage.rivulet.fs.input_file import InputFile
-from deltacat.storage.rivulet.fs.output_file import OutputFile
-from deltacat.storage.rivulet.metastore.sst import (
+from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
+from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
+from deltacat.experimental.storage.rivulet.metastore.sst import (
     SSTWriter,
     SSTableRow,
     SSTReader,
deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py
CHANGED
@@ -1,8 +1,8 @@
 from dataclasses import dataclass
 from typing import Protocol, Any, List
 
-from deltacat.storage.rivulet.fs.input_file import InputFile
-from deltacat.storage.rivulet.fs.output_file import OutputFile
+from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
+from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
 
 
 @dataclass(frozen=True)
deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py
CHANGED
@@ -8,9 +8,9 @@ from typing import Any, Dict, Set, List, FrozenSet, Iterable, TypeVar, NamedTuple
 
 from intervaltree import Interval, IntervalTree
 
-from deltacat.storage.rivulet.metastore.delta import DeltaContext
-from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
-from deltacat.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
+from deltacat.experimental.storage.rivulet import Schema
 
 T = TypeVar("T")
 
deltacat/experimental/storage/rivulet/parquet/__init__.py
ADDED
@@ -0,0 +1,7 @@
+# TODO later on this will be moved to a dedicated package
+from deltacat.experimental.storage.rivulet.parquet.file_reader import ParquetFileReader
+from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+    FileReaderRegistrar,
+)
+
+FileReaderRegistrar.register_reader("parquet", ParquetFileReader)
deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py
CHANGED
@@ -4,15 +4,17 @@ from typing import Optional
 
 from pyarrow import RecordBatch
 
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet.reader.data_reader import (
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet.reader.data_reader import (
     RowAndKey,
     FileReader,
     FILE_FORMAT,
 )
-from deltacat.storage.rivulet.reader.pyarrow_data_reader import
-
+from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+    RecordBatchRowIndex,
+)
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
 import pyarrow.parquet as pq
 import pyarrow as pa
 
deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py
CHANGED
@@ -3,11 +3,11 @@ from typing import List, Any
 import pyarrow as pa
 from pyarrow.parquet import FileMetaData
 
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet import Schema
-from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
 
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
 
 
 class ParquetDataSerializer(ArrowSerializer):
deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py
CHANGED
@@ -15,19 +15,30 @@ from typing import (
     AbstractSet,
 )
 
-from deltacat.storage.rivulet.metastore.delta import DeltaContext
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet.metastore.sst_interval_tree import (
+from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
     OrderedBlockGroups,
     BlockGroup,
     Block,
 )
-from deltacat.storage.rivulet.reader.data_reader import
-
-
-
-from deltacat.storage.rivulet.reader.
-
+from deltacat.experimental.storage.rivulet.reader.data_reader import (
+    RowAndKey,
+    FileReader,
+)
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+    ArrowDataReader,
+)
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
+from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+    FileReaderRegistrar,
+)
+from deltacat.experimental.storage.rivulet import Schema
 from deltacat import logs
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))