deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. deltacat/__init__.py +19 -15
  2. deltacat/benchmarking/benchmark_engine.py +4 -2
  3. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  4. deltacat/catalog/__init__.py +62 -5
  5. deltacat/catalog/main/impl.py +18 -8
  6. deltacat/catalog/model/catalog.py +111 -73
  7. deltacat/catalog/model/properties.py +25 -22
  8. deltacat/compute/jobs/client.py +7 -5
  9. deltacat/constants.py +1 -2
  10. deltacat/env.py +10 -0
  11. deltacat/examples/basic_logging.py +1 -3
  12. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  13. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  14. deltacat/examples/indexer/indexer.py +2 -2
  15. deltacat/examples/indexer/job_runner.py +1 -2
  16. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  17. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  18. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  19. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  20. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  21. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  22. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  23. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  24. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  25. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  26. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  27. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  28. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  29. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  30. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  31. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  32. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  33. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  34. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  35. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  36. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  37. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  38. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  39. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  40. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  41. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  42. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  43. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  44. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  45. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  46. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  47. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  49. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  50. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  51. deltacat/io/reader/deltacat_read_api.py +1 -1
  52. deltacat/storage/model/shard.py +6 -2
  53. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  54. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  55. deltacat/tests/catalog/model/__init__.py +0 -0
  56. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  57. deltacat/tests/catalog/test_catalogs.py +52 -98
  58. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  59. deltacat/tests/daft/__init__.py +0 -0
  60. deltacat/tests/daft/test_model.py +97 -0
  61. deltacat/tests/experimental/__init__.py +0 -0
  62. deltacat/tests/experimental/catalog/__init__.py +0 -0
  63. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  64. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  65. deltacat/tests/experimental/daft/__init__.py +0 -0
  66. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  67. deltacat/tests/experimental/storage/__init__.py +0 -0
  68. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  69. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  70. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  71. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  72. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  73. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  74. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  75. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  76. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  77. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  78. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  79. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  80. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  81. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  82. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  83. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  84. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  85. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  86. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  87. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  88. deltacat/tests/storage/model/test_shard.py +3 -1
  89. deltacat/types/media.py +3 -3
  90. deltacat/utils/daft.py +530 -4
  91. deltacat/utils/export.py +3 -1
  92. deltacat/utils/url.py +1 -1
  93. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
  94. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
  95. deltacat/catalog/iceberg/__init__.py +0 -4
  96. deltacat/daft/daft_scan.py +0 -115
  97. deltacat/daft/model.py +0 -258
  98. deltacat/daft/translator.py +0 -126
  99. deltacat/examples/common/fixtures.py +0 -15
  100. deltacat/storage/rivulet/__init__.py +0 -11
  101. deltacat/storage/rivulet/feather/__init__.py +0 -5
  102. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  103. /deltacat/{daft → examples/experimental}/__init__.py +0 -0
  104. /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
  105. /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
  106. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  107. /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
  108. /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
  109. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  110. /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  111. /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
  112. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  113. /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  114. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  115. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  116. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  117. /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  118. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  119. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  120. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  121. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
  122. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
  123. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  124. /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
  125. /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
  126. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  127. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  128. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/compute/jobs/client.py CHANGED
@@ -70,6 +70,7 @@ def _get_head_node_ip(cluster_cfg: str) -> str:
          check=True,
      )
      # the head node IP should be the last line printed to stdout
+     # TODO(pdames): add IPv6 support
      head_node_ip = proc.stdout.splitlines()[-1]
      if not re.match(
          r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
@@ -322,12 +323,12 @@ class DeltaCatJobClient(JobSubmissionClient):

  def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
      """
-     Create a Ray Job Client that can be used to submit jobs to a local Ray
+     Create a DeltaCAT Job Client that can be used to submit jobs to a local Ray
      cluster. Initializes Ray if it's not already running.

      Args:
-         *args: Positional arguments to pass to `ray.init()`.
-         **kwargs: Keyword arguments to pass to `ray.init()`.
+         *args: Positional arguments to pass to `deltacat.init()`.
+         **kwargs: Keyword arguments to pass to `deltacat.init()`.
      Returns:
          DeltaCatJobClient: A client instance that can be used to submit and
              manage local Ray jobs.
@@ -338,7 +339,7 @@ def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
      if not dc.is_initialized():
          context = dc.init(*args, **kwargs)
      else:
-         context = dc.init(ray_init_args={"ignore_reinit_error": True})
+         context = dc.init()
      if context.dashboard_url:
          head_node_ip, port = context.dashboard_url.split(":")
      else:
@@ -367,7 +368,8 @@ def job_client(
      port: Union[str, int] = "8265",
  ) -> DeltaCatJobClient:
      """
-     Create a DeltaCAT Job Client that can be used to submit jobs to a remote Ray cluster.
+     Create a DeltaCAT Job Client that can be used to submit jobs to a remote
+     Ray cluster.

      Args:
          cluster_cfg_file_path: Path to the Ray Cluster Launcher
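Usage sketch (not from this diff; the cluster config path is illustrative only): both helpers above are importable from deltacat.compute.jobs.client.

from deltacat.compute.jobs.client import job_client, local_job_client

# Local cluster: calls deltacat.init() if DeltaCAT isn't already initialized.
client = local_job_client()

# Remote cluster: point at a Ray Cluster Launcher config file (illustrative path).
remote_client = job_client("clusters/aws/deltacat.yaml")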
deltacat/constants.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations


  from deltacat.utils.common import env_string, env_bool
- import os

  # Environment variables
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -40,7 +39,7 @@ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
  )
  DELTACAT_ROOT = env_string(
      "DELTACAT_ROOT",
-     os.path.join(os.getcwd(), ".deltacat"),
+     "",
  )

  # CLI Args
deltacat/env.py CHANGED
@@ -1,3 +1,4 @@
+ import argparse
  import os
  import logging
  from typing import Dict, Any
@@ -49,3 +50,12 @@ def create_ray_runtime_environment() -> Dict[str, Any]:
          "env_vars": worker_env_vars,
      }
      return runtime_environment
+
+
+ def store_cli_args_in_os_environ(script_args_list=[]):
+     parser = argparse.ArgumentParser()
+     for args, kwargs in script_args_list:
+         parser.add_argument(*args, **kwargs)
+     args = parser.parse_args()
+     print(f"Command Line Arguments: {args}")
+     os.environ.update(vars(args))
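Usage sketch (not from this diff): the new helper wraps argparse and copies the parsed values into os.environ, so the example scripts below can read their CLI arguments as environment variables. The argument names and defaults here are illustrative only.

from deltacat.env import store_cli_args_in_os_environ

# Each entry is (args, kwargs) for argparse.ArgumentParser.add_argument;
# string defaults keep os.environ.update() happy when no flags are passed.
store_cli_args_in_os_environ(
    [
        (["--warehouse"], {"help": "warehouse root path", "default": ".deltacat"}),
        (["--namespace"], {"help": "table namespace", "default": "default"}),
    ]
)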
deltacat/examples/basic_logging.py CHANGED
@@ -6,9 +6,7 @@ import ray

  from deltacat import logs
  from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from env import store_cli_args_in_os_environ
  from deltacat.env import create_ray_runtime_environment

  # initialize the driver logger
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py CHANGED
@@ -9,10 +9,8 @@ import deltacat as dc

  from deltacat import logs
  from deltacat import IcebergCatalog
- from deltacat.catalog.iceberg import IcebergCatalogConfig
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+ from env import store_cli_args_in_os_environ

  from pyiceberg.schema import (
      Schema,
@@ -23,7 +21,7 @@ from pyiceberg.schema import (
  from pyiceberg.partitioning import PartitionSpec, PartitionField
  from pyiceberg.transforms import BucketTransform

- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
  )
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py CHANGED
@@ -4,9 +4,7 @@ import deltacat as dc

  from deltacat import logs
  from deltacat import IcebergCatalog
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from env import store_cli_args_in_os_environ

  from pyiceberg.schema import (
      Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
  from pyiceberg.table.sorting import SortField, SortOrder

  from deltacat.exceptions import TableAlreadyExistsError
- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
      SortSchemeMapper,
deltacat/examples/indexer/indexer.py CHANGED
@@ -59,8 +59,8 @@ def run(
          "use_pyarrow": True,  # use the native pyarrow reader
      },
      # writer arguments to pass to the default writer (polars)
-     # for the given parquet-based datasink, it accepts the same
-     # arguments as polars.DataFrame.write_parquet except for `file`
+     # for the given parquet-based datasink, it generally accepts the same
+     # arguments as polars.DataFrame.write_{dest-type} except for `file`
      writer_args={
          "compression": "lz4",  # faster compression & decompression
          # "compression": "zstd",  # better compression ratio
deltacat/examples/indexer/job_runner.py CHANGED
@@ -64,8 +64,7 @@ def run_sync(
      cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
      client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
      job_number = 0
-     while jobs_to_submit > 0:
-         jobs_to_submit -= 1
+     while job_number < jobs_to_submit:
          job_dest = dest + f".{job_number}"
          job_run_result = client.run_job(
              # Entrypoint shell command to execute
deltacat/experimental/catalog/iceberg/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+     IcebergCatalogConfig,
+ )
+ import deltacat.experimental.catalog.iceberg.impl as IcebergCatalog
+
+ __all__ = ["IcebergCatalogConfig", "IcebergCatalog"]
deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py CHANGED
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:

      This configuration is passed through to PyIceberg by invoking load_catalog.
      The Properties provided must match properties accepted by PyIceberg for each catalog type
-     See: :func:`deltacat.catalog.iceberg.initialize`
+     See: :func:`deltacat.experimental.catalog.iceberg.initialize`

      Attributes:
          type: The PyIceberg Catalog instance
deltacat/{catalog → experimental/catalog}/iceberg/impl.py CHANGED
@@ -1,4 +1,5 @@
  import logging
+ import sys

  from typing import Any, Dict, List, Optional, Union

@@ -7,13 +8,19 @@ from daft.daft import ScanOperatorHandle, StorageConfig
  from daft.logical.builder import LogicalPlanBuilder

  from deltacat import logs
+ from deltacat.catalog.model.catalog import Catalog
  from deltacat.catalog.model.table_definition import TableDefinition
- from deltacat.daft.daft_scan import DeltaCatScanOperator
+ from deltacat.utils.daft import DeltaCatScanOperator
  from deltacat.exceptions import TableAlreadyExistsError
- from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
- from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
+ from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
+     IcebergScanPlanner,
+ )
+ from deltacat.experimental.storage.iceberg.model import (
+     PartitionSchemeMapper,
+     SchemaMapper,
+ )
  from deltacat.storage.model.partition import PartitionScheme
- from deltacat.storage.iceberg.impl import _get_native_catalog
+ from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
  from deltacat.storage.model.sort_key import SortScheme
  from deltacat.storage.model.list_result import ListResult
  from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -26,20 +33,31 @@ from deltacat.storage.model.types import (
      LocalTable,
      StreamFormat,
  )
- from deltacat.storage.iceberg import impl as IcebergStorage
+ from deltacat.experimental.storage.iceberg import impl as IcebergStorage
  from deltacat.types.media import ContentType
  from deltacat.types.tables import TableWriteMode
  from deltacat.constants import DEFAULT_NAMESPACE
- from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+     IcebergCatalogConfig,
+ )

- from pyiceberg.catalog import Catalog, load_catalog
+ from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
  from pyiceberg.transforms import BucketTransform

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

+ IcebergCatalog = sys.modules[__name__]
+
+
+ def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
+     """
+     Factory method to construct a catalog from Iceberg catalog configuration.
+     """
+     return Catalog(config, impl=IcebergCatalog, *args, **kwargs)
+

  # catalog functions
- def initialize(*args, config: IcebergCatalogConfig, **kwargs) -> Catalog:
+ def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
      """
      Initializes an Iceberg catalog with the given config.

@@ -123,7 +141,7 @@ def write_to_table(
      )
      # TODO(pdames): only append s3:// to output file paths when writing to S3!
      out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
-     from deltacat.catalog.iceberg import overrides
+     from deltacat.experimental.catalog.iceberg import overrides

      overrides.append(
          table_definition.table.native_object,
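Usage sketch (not from this diff): the new from_config factory wraps this module in a generic DeltaCAT Catalog handle. The IcebergCatalogConfig property names, the SQL catalog settings, and dc.put_catalog below are assumptions based on the surrounding code, not confirmed by this diff.

import deltacat as dc
from pyiceberg.catalog import CatalogType

from deltacat.experimental.catalog.iceberg import IcebergCatalog, IcebergCatalogConfig

# Assumed IcebergCatalogConfig fields: `type` (a PyIceberg catalog type) and
# `properties` (passed through to pyiceberg.catalog.load_catalog).
config = IcebergCatalogConfig(
    type=CatalogType.SQL,
    properties={
        "uri": "sqlite:////tmp/iceberg/catalog.db",
        "warehouse": "file:///tmp/iceberg/warehouse",
    },
)

catalog = IcebergCatalog.from_config(config)  # returns a deltacat Catalog
dc.put_catalog("iceberg", catalog)  # assumed registration API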
deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py CHANGED
@@ -5,7 +5,7 @@ from deltacat.storage.model.scan.push_down import Pushdown
  from deltacat.storage.model.scan.scan_plan import ScanPlan
  from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
  from deltacat.storage.util.scan_planner import ScanPlanner
- from deltacat.storage.iceberg.impl import _try_load_iceberg_table
+ from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table


  class IcebergScanPlanner(ScanPlanner):
deltacat/{storage → experimental/storage}/iceberg/impl.py CHANGED
@@ -32,7 +32,7 @@ from deltacat.storage import (
      NamespaceProperties,
  )
  from deltacat.storage.model.manifest import Manifest
- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
      SortSchemeMapper,
deltacat/experimental/storage/rivulet/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
+ from deltacat.experimental.storage.rivulet.schema.schema import Field
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
+ from deltacat.experimental.storage.rivulet.schema.schema import Datatype
+
+ __all__ = [
+     "Schema",
+     "Field",
+     "Dataset",
+     "Datatype",
+ ]
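For downstream code, the practical effect of this move is an import-path change; the same top-level names appear to be re-exported from the new experimental package:

# 2.0.0b10:
#   from deltacat.storage.rivulet import Dataset, Datatype, Field, Schema
# 2.0.0b11:
from deltacat.experimental.storage.rivulet import Dataset, Datatype, Field, Schema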
deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py CHANGED
@@ -2,10 +2,13 @@ from abc import ABC, abstractmethod
  from typing import Iterator, List, Any
  import pyarrow as pa

- from deltacat.storage.rivulet.metastore.sst import SSTableRow
- from deltacat.storage.rivulet import Schema
- from deltacat.storage.rivulet.serializer import DataSerializer, MEMTABLE_DATA
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ from deltacat.experimental.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.serializer import (
+     DataSerializer,
+     MEMTABLE_DATA,
+ )
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider


  class ArrowSerializer(DataSerializer, ABC):
deltacat/{storage → experimental/storage}/rivulet/dataset.py CHANGED
@@ -24,19 +24,23 @@ from deltacat.storage.model.shard import Shard, ShardingStrategy
  from deltacat.storage.model.stream import Stream, StreamLocator
  from deltacat.storage.model.transaction import TransactionOperationList
  from deltacat.storage.model.types import CommitState, StreamFormat
- from deltacat.storage.rivulet.fs.file_store import FileStore
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
- from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
- from deltacat.storage.rivulet import Schema, Field
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+     DatasetMetastore,
+ )
+ from deltacat.experimental.storage.rivulet import Schema, Field
  from deltacat.utils.export import export_dataset
  from .schema.schema import Datatype

- from deltacat.storage.rivulet.reader.data_scan import DataScan
- from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+ from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+ from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
+     QueryExpression,
+ )

- from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
      MemtableDatasetWriter,
  )

deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py CHANGED
@@ -2,13 +2,16 @@ from __future__ import annotations

  from typing import List, Callable, Any

- from deltacat.storage.rivulet.field_group import FieldGroup
- from deltacat.storage.rivulet.mvp.Table import MvpTable
- from deltacat.storage.rivulet import Schema
- from deltacat.storage.rivulet.reader.data_scan import DataScan
- from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
- from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
+ from deltacat.experimental.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+     DatasetMetastore,
+ )
+ from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
+     QueryExpression,
+ )


  class DatasetExecutor:
@@ -22,12 +25,10 @@

      def __init__(
          self,
-         field_groups: List[FieldGroup],
          schema: Schema,
          metastore: DatasetMetastore,
      ):
          self.effective_schema: Schema = schema.__deepcopy__()
-         self.field_groups = field_groups
          self.output: MvpTable | None = None
          self._metastore = metastore

@@ -64,18 +65,9 @@

          TODO for now this is doing dumb in-memory implementation and later this is going to be replaced by rust library
          """
-         if len(self.field_groups) == 1:
-             return self._read_as_mvp_table(schema, self.field_groups[0])
-         else:
-             ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
-             ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
-             merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
-             for i in range(2, len(self.field_groups)):
-                 ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
-                 merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
-             return merged
+         return self._read_as_mvp_table(schema)

-     def _read_as_mvp_table(self, schema: Schema, field_group: FieldGroup):
+     def _read_as_mvp_table(self, schema: Schema):
          data = list(
              DataScan(
                  schema, QueryExpression(), DatasetReader(self._metastore)
deltacat/experimental/storage/rivulet/feather/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # TODO later on this will be moved to a dedicated package
+ from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
+ from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+     FileReaderRegistrar,
+ )
+
+ FileReaderRegistrar.register_reader("feather", FeatherFileReader)
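The package-level registration above follows the same registrar pattern used for the parquet package later in this diff. A hypothetical third format would plug in the same way; OrcFileReader and the "orc" extension below are illustrative only.

from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
    FileReaderRegistrar,
)


class OrcFileReader:
    """Hypothetical reader for ORC data files (illustration only); a real
    implementation would follow the rivulet FileReader interface, as
    FeatherFileReader and ParquetFileReader do."""

    ...


# register_reader maps a file extension to the reader class to use for files of
# that type, mirroring the feather and parquet registrations in this release.
FileReaderRegistrar.register_reader("orc", OrcFileReader)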
deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py CHANGED
@@ -5,15 +5,17 @@ from typing import Optional
  import pyarrow.ipc
  from pyarrow import RecordBatch, RecordBatchFileReader

- from deltacat.storage.rivulet.fs.file_provider import FileProvider
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
- from deltacat.storage.rivulet.reader.data_reader import (
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
      RowAndKey,
      FileReader,
      FILE_FORMAT,
  )
- from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
- from deltacat.storage.rivulet.schema.schema import Schema
+ from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+     RecordBatchRowIndex,
+ )
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema


  class FeatherFileReader(FileReader[RecordBatchRowIndex]):
deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py CHANGED
@@ -3,10 +3,10 @@ from typing import List
  import pyarrow as pa
  from pyarrow import feather

- from deltacat.storage.rivulet.metastore.sst import SSTableRow
- from deltacat.storage.rivulet import Schema
- from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ from deltacat.experimental.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider


  class FeatherDataSerializer(ArrowSerializer):
deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py CHANGED
@@ -3,9 +3,9 @@ import time
  from typing import List, Generator

  from deltacat.storage.model.partition import PartitionLocator
- from deltacat.storage.rivulet.fs.file_store import FileStore
- from deltacat.storage.rivulet.fs.input_file import InputFile
- from deltacat.storage.rivulet.fs.output_file import OutputFile
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+ from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
+ from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
  from deltacat.utils.metafile_locator import _find_partition_path


deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py CHANGED
@@ -4,8 +4,8 @@ from pyarrow.fs import FileSystem, FileType, FileSelector
  # TODO(deltacat): Rely on deltacat implementation to resolve path and filesystem.
  from ray.data.datasource.path_util import _resolve_paths_and_filesystem

- from deltacat.storage.rivulet.fs.input_file import FSInputFile
- from deltacat.storage.rivulet.fs.output_file import FSOutputFile
+ from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile
+ from deltacat.experimental.storage.rivulet.fs.output_file import FSOutputFile


  class FileStore:
deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py CHANGED
@@ -5,7 +5,7 @@ from typing import Protocol

  from pyarrow.fs import FileSystem, FileType

- from deltacat.storage.rivulet.fs.input_file import FSInputFile, InputFile
+ from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile, InputFile


  class OutputStream(Protocol):  # pragma: no cover
deltacat/{storage → experimental/storage}/rivulet/logical_plan.py CHANGED
@@ -1,9 +1,9 @@
  from dataclasses import dataclass
  from typing import List, Callable, Any, Protocol

- from deltacat.storage.rivulet.dataset_executor import DatasetExecutor
- from deltacat.storage.rivulet.mvp.Table import MvpTable
- from deltacat.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.dataset_executor import DatasetExecutor
+ from deltacat.experimental.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable


  class DatasetOperation(Protocol):
@@ -99,7 +99,7 @@ class LogicalPlan:
          self.operations.append(CollectOperation())
          return self

-     def execute(self, executor: DatasetExecutor) -> "MvpTable":
+     def execute(self, executor: DatasetExecutor) -> MvpTable:
          for operation in self.operations:
              operation.visit(executor)
          return executor.output
deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py CHANGED
@@ -19,7 +19,7 @@ from deltacat.storage.model.partition import PartitionLocator
  from deltacat.storage.model.transaction import TransactionOperationList

  from deltacat.storage.model.types import StreamFormat
- from deltacat.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet import Schema

  StreamPosition = int
  """The stream position for creating a consistent ordering of manifests."""
deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py CHANGED
@@ -4,9 +4,9 @@ import json
  from itertools import zip_longest
  from typing import List

- from deltacat.storage.rivulet.fs.input_file import InputFile
- from deltacat.storage.rivulet.fs.output_file import OutputFile
- from deltacat.storage.rivulet.metastore.sst import (
+ from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
+ from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
+ from deltacat.experimental.storage.rivulet.metastore.sst import (
      SSTWriter,
      SSTableRow,
      SSTReader,
deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py CHANGED
@@ -1,8 +1,8 @@
  from dataclasses import dataclass
  from typing import Protocol, Any, List

- from deltacat.storage.rivulet.fs.input_file import InputFile
- from deltacat.storage.rivulet.fs.output_file import OutputFile
+ from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
+ from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile


  @dataclass(frozen=True)
deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py CHANGED
@@ -8,9 +8,9 @@ from typing import Any, Dict, Set, List, FrozenSet, Iterable, TypeVar, NamedTupl

  from intervaltree import Interval, IntervalTree

- from deltacat.storage.rivulet.metastore.delta import DeltaContext
- from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
- from deltacat.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
+ from deltacat.experimental.storage.rivulet import Schema

  T = TypeVar("T")

deltacat/experimental/storage/rivulet/parquet/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # TODO later on this will be moved to a dedicated package
+ from deltacat.experimental.storage.rivulet.parquet.file_reader import ParquetFileReader
+ from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+     FileReaderRegistrar,
+ )
+
+ FileReaderRegistrar.register_reader("parquet", ParquetFileReader)
deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py CHANGED
@@ -4,15 +4,17 @@ from typing import Optional

  from pyarrow import RecordBatch

- from deltacat.storage.rivulet.fs.file_provider import FileProvider
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
- from deltacat.storage.rivulet.reader.data_reader import (
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
      RowAndKey,
      FileReader,
      FILE_FORMAT,
  )
- from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
- from deltacat.storage.rivulet.schema.schema import Schema
+ from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+     RecordBatchRowIndex,
+ )
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
  import pyarrow.parquet as pq
  import pyarrow as pa

deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py CHANGED
@@ -3,11 +3,11 @@ from typing import List, Any
  import pyarrow as pa
  from pyarrow.parquet import FileMetaData

- from deltacat.storage.rivulet.metastore.sst import SSTableRow
- from deltacat.storage.rivulet import Schema
- from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ from deltacat.experimental.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer

- from deltacat.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider


  class ParquetDataSerializer(ArrowSerializer):
deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py CHANGED
@@ -15,19 +15,30 @@ from typing import (
      AbstractSet,
  )

- from deltacat.storage.rivulet.metastore.delta import DeltaContext
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
- from deltacat.storage.rivulet.metastore.sst_interval_tree import (
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
      OrderedBlockGroups,
      BlockGroup,
      Block,
  )
- from deltacat.storage.rivulet.reader.data_reader import RowAndKey, FileReader
- from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
- from deltacat.storage.rivulet.reader.pyarrow_data_reader import ArrowDataReader
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
- from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
- from deltacat.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
+     RowAndKey,
+     FileReader,
+ )
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+     DatasetMetastore,
+ )
+ from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+     ArrowDataReader,
+ )
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
+     QueryExpression,
+ )
+ from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+     FileReaderRegistrar,
+ )
+ from deltacat.experimental.storage.rivulet import Schema
  from deltacat import logs

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))