deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
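
The headline change in this release is a package reorganization: the Iceberg catalog/storage modules and the entire Rivulet storage layer move under a new `deltacat.experimental` namespace, and the old `deltacat.daft` package is folded into `deltacat.utils.daft`. Downstream code must update its imports accordingly. A minimal before/after sketch, assuming the re-exported names are otherwise unchanged by the move:

# deltacat 2.0.0b9 import paths (removed in 2.0.0b11):
# from deltacat.storage.rivulet import Dataset, Schema, Field, Datatype
# from deltacat.storage.iceberg.model import SchemaMapper
# from deltacat.catalog.iceberg.impl import initialize

# deltacat 2.0.0b11 equivalents under the experimental namespace:
from deltacat.experimental.storage.rivulet import Dataset, Schema, Field, Datatype
from deltacat.experimental.storage.iceberg.model import SchemaMapper
from deltacat.experimental.catalog.iceberg.impl import initialize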
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py
RENAMED
@@ -4,9 +4,7 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.examples.common.fixtures import (
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
 from pyiceberg.table.sorting import SortField, SortOrder
 
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
deltacat/examples/hello_world.py
CHANGED
@@ -1,12 +1,10 @@
 import ray
 import deltacat
 import daft
-import pyiceberg
 
 
 def print_package_version_info():
     print(f"DeltaCAT Version: {deltacat.__version__}")
-    print(f"PyIceberg Version: {pyiceberg.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")
 
@@ -24,4 +22,8 @@ def run():
 
 
 if __name__ == "__main__":
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example
     run()
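
The `hello_world.py` change above drops the PyIceberg dependency and establishes the entrypoint pattern used throughout the updated examples: call `deltacat.init()` before doing any work. A minimal sketch of that pattern, assuming `init()` needs no arguments for local use (as in the diff):

import deltacat


def run() -> None:
    # application logic goes here
    print(f"Hello, DeltaCAT {deltacat.__version__}!")


if __name__ == "__main__":
    # initialize deltacat before running any example logic
    deltacat.init()
    run()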
deltacat/examples/indexer/indexer.py
ADDED
@@ -0,0 +1,163 @@
+import argparse
+
+from datetime import datetime
+
+import ray
+
+import deltacat
+import daft
+import pyarrow as pa
+import pandas as pd
+import polars as pl
+import numpy as np
+
+from deltacat import DeltaCatUrl
+
+
+def print_package_version_info() -> None:
+    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"Ray Version: {ray.__version__}")
+    print(f"Daft Version: {daft.__version__}")
+    print(f"NumPy Version: {np.__version__}")
+    print(f"PyArrow Version: {pa.__version__}")
+    print(f"Polars Version: {pl.__version__}")
+    print(f"Pandas Version: {pd.__version__}")
+
+
+def json_path_to_regex(path: str):
+    if not path:
+        raise ValueError("Path cannot be empty")
+    parts = path.split("/")
+    leaf_key = parts.pop()
+    regex = r""
+    for part in parts:
+        if part.strip():  # discard leading and/or redundant separators
+            regex += rf'"{part}"\s*:\s*[{{\[].*?'
+    regex += rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
+    return regex
+
+
+def run(
+    source: str,
+    dest: str,
+) -> None:
+    # print package version info
+    print_package_version_info()
+
+    # run a synchronous copy from the source to the destination
+    deltacat.copy(
+        DeltaCatUrl(source),
+        DeltaCatUrl(dest),
+        # reader arguments to pass to the default reader (polars)
+        # for the given text-based datasource, it accepts the same
+        # arguments as polars.read_csv except for `source`, `n_threads`
+        # `new_columns`, `separator`, `has_header`, `quote_char`, and
+        # `infer_schema`.
+        reader_args={
+            "low_memory": True,  # try to use less memory (++stability, --perf)
+            "batch_size": 1024,  # text line count read into a buffer at once
+            "use_pyarrow": True,  # use the native pyarrow reader
+        },
+        # writer arguments to pass to the default writer (polars)
+        # for the given parquet-based datasink, it generally accepts the same
+        # arguments as polars.DataFrame.write_{dest-type} except for `file`
+        writer_args={
+            "compression": "lz4",  # faster compression & decompression
+            # "compression": "zstd",  # better compression ratio
+            # "compression": "snappy",  # compatible w/ older Parquet readers
+        },
+        # Transforms to run against the default polars dataframe read.
+        # By default, each transform takes a polars dataframe `df` as input
+        # and produces a polars dataframe as output. All transforms listed
+        # are run in order (i.e., the dataframe output from transform[0]
+        # is the dataframe input to transform[1]).
+        #
+        # See:
+        # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
+        # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
+        transforms=[
+            lambda df, src: df.rename(
+                {"text": "utf8_body"},
+            ),
+            lambda df, src: df.with_columns(
+                pl.col("utf8_body").hash().alias("utf8_body_hash"),
+                pl.lit(datetime.utcnow()).dt.datetime().alias("processing_time"),
+                pl.lit(src.url_path).alias("source_file_path"),
+            ),
+        ],
+    )
+
+
+if __name__ == "__main__":
+    """
+    Example 1: Run this script locally using Ray:
+    $ python indexer.py \
+    $   --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
+    $   --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'
+
+    Example 2: Submit this script as a local Ray job using a local job client:
+    >>> from deltacat import local_job_client
+    >>> client = local_job_client()
+    >>> # read the source file as line-delimited text
+    >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
+    >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
+    >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
+    >>> try:
+    >>>     job_run_result = client.run_job(
+    >>>         # Entrypoint shell command to run the indexer job
+    >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
+    >>>         # Path to the local directory that contains the indexer.py file
+    >>>         runtime_env={"working_dir": "./deltacat/examples/indexer.py"},
+    >>>     )
+    >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
+    >>>     print(f"Job ID {job_run_result.job_id} logs: ")
+    >>>     print(job_run_result.job_logs)
+    >>> except RuntimeError as e:
+    >>>     print(f"Job Run Failed: {e}")
+    >>> except TimeoutError as e:
+    >>>     print(f"Job Run Timed Out: {e}")
+
+    Example 3: Submit this script as a remote Ray job using a remote job client:
+    >>> from deltacat import job_client
+    >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
+    >>> # automatically launches the cluster if it doesn't exist or has died
+    >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
+    >>> client = job_client()
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    >>>
+    >>> # OR use an explicit cluster launcher config file path
+    >>> client = job_client("/Users/pdames/workspace/deltacat.yaml")
+    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to index.",
+                "type": str,
+            },
+        ),
+    ]
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # initialize deltacat
+    deltacat.init()
+
+    # run the example using the parsed arguments
+    run(**vars(args))
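
One detail of the new indexer worth calling out: `json_path_to_regex` converts a slash-delimited JSON path into a regex whose named capture group extracts the leaf value. Note the `(?<name>…)` group syntax targets a Rust-style regex engine (e.g., polars string expressions) rather than Python's `re` module, which would need `(?P<name>…)`. A worked example with a hypothetical path:

# assumes json_path_to_regex from the new indexer.py above
regex = json_path_to_regex("payload/metadata/title")
print(regex)
# "payload"\s*:\s*[{\[].*?"metadata"\s*:\s*[{\[].*?"title"\s*:\s*"(?<title>.*?)"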
deltacat/examples/indexer/job_runner.py
ADDED
@@ -0,0 +1,198 @@
+import argparse
+import pathlib
+
+from deltacat.compute import (
+    job_client,
+    JobStatus,
+)
+
+
+def run_async(
+    source: str,
+    dest: str,
+    jobs_to_submit: int,
+    job_timeout: int,
+    cloud: str,
+    restart_ray: bool,
+):
+    # print package version info
+    working_dir = pathlib.Path(__file__).parent
+    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+    job_number = 0
+    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+    job_ids = []
+    while jobs_to_submit > 0:
+        jobs_to_submit -= 1
+        job_dest = dest + f".{job_number}"
+        job_id = client.submit_job(
+            # Entrypoint shell command to execute
+            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+            # Path to the local directory that contains the indexer.py file
+            # This entire directory will be zipped into a job package, so keep
+            # it small.
+            runtime_env={"working_dir": working_dir},
+        )
+        job_ids.append(job_id)
+        job_number += 1
+
+    print("Waiting for all jobs to complete...")
+    job_number = 0
+    all_job_logs = ""
+    for job_id in job_ids:
+        job_status = client.await_job(job_id, timeout_seconds=job_timeout)
+        if job_status != JobStatus.SUCCEEDED:
+            print(f"Job `{job_id}` logs: ")
+            print(client.get_job_logs(job_id))
+            raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
+        all_job_logs += f"\nJob #{job_number} logs: \n"
+        all_job_logs += client.get_job_logs(job_id)
+        job_number += 1
+    print("All jobs completed!")
+    print("Job Logs: ")
+    print(all_job_logs)
+
+
+def run_sync(
+    source: str,
+    dest: str,
+    jobs_to_submit: int,
+    job_timeout: int,
+    cloud: str,
+    restart_ray: bool,
+):
+    working_dir = pathlib.Path(__file__).parent
+    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
+    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
+    job_number = 0
+    while job_number < jobs_to_submit:
+        job_dest = dest + f".{job_number}"
+        job_run_result = client.run_job(
+            # Entrypoint shell command to execute
+            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
+            # Path to the local directory that contains the indexer.py file
+            # This entire directory will be zipped into a job package, so keep
+            # it small.
+            runtime_env={"working_dir": working_dir},
+            timeout_seconds=job_timeout,
+        )
+        print(
+            f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}"
+        )
+        print(f"Job ID {job_run_result.job_id} logs: ")
+        print(job_run_result.job_logs)
+        job_number += 1
+
+
+def run(
+    source: str,
+    dest: str,
+    restart_ray: bool,
+    jobs_to_submit: int,
+    job_timeout: int,
+    asynchronous: bool,
+    cloud_provider: str,
+):
+    run_func = run_async if asynchronous else run_sync
+    run_func(
+        source=source,
+        dest=dest,
+        jobs_to_submit=jobs_to_submit,
+        job_timeout=job_timeout,
+        cloud=cloud_provider,
+        restart_ray=restart_ray,
+    )
+
+
+if __name__ == "__main__":
+    """
+    # Run this example through a command of the form:
+    $ python ./deltacat/examples/job_runner.py -- \
+    $   --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
+    $   --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
+    $   --asynchronous \
+    $   --jobs-to-submit 100 \
+    $   --job-timeout 90 \
+    $   --cloud-provider aws
+    """
+    script_args = [
+        (
+            [
+                "--source",
+            ],
+            {
+                "help": "Source DeltaCAT URL to index.",
+                "type": str,
+                "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
+            },
+        ),
+        (
+            [
+                "--dest",
+            ],
+            {
+                "help": "Destination DeltaCAT URL to store the indexed file.",
+                "type": str,
+                "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
+            },
+        ),
+        (
+            [
+                "--restart-ray",
+            ],
+            {
+                "help": "Restart Ray on an existing cluster.",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            [
+                "--asynchronous",
+            ],
+            {
+                "help": "Run jobs asynchronously.",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            [
+                "--jobs-to-submit",
+            ],
+            {
+                "help": "Number of indexer jobs to submit for execution.",
+                "type": int,
+                "default": 1,
+            },
+        ),
+        (
+            [
+                "--job-timeout",
+            ],
+            {
+                "help": "Job timeout in seconds.",
+                "type": int,
+                "default": 300,
+            },
+        ),
+        (
+            [
+                "--cloud-provider",
+            ],
+            {
+                "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
+                "type": str,
+                "default": "aws",
+            },
+        ),
+    ]
+
+    # parse CLI input arguments
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+
+    # run the example using os.environ as kwargs
+    run(**vars(args))
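
The job runner exercises both submission modes of the new job client (`deltacat/compute/jobs/client.py`): `submit_job` returns a job ID immediately so many jobs can run in parallel and be awaited later, while `run_job` blocks until a single job reaches a terminal state. A minimal sketch using only the calls shown above, with placeholder source/dest URLs and an assumed cluster config path:

from deltacat.compute import job_client, JobStatus

# assumes a Ray cluster launcher config at ./deltacat.yaml
client = job_client("./deltacat.yaml")

# async mode: submit now, await later
job_id = client.submit_job(
    entrypoint="python3 indexer.py --source '<src-url>' --dest '<dst-url>'",
    runtime_env={"working_dir": "."},
)
status = client.await_job(job_id, timeout_seconds=300)
if status != JobStatus.SUCCEEDED:
    # surface the job logs to debug the failure
    print(client.get_job_logs(job_id))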
deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py
RENAMED
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
 
     This configuration is passed through to PyIceberg by invoking load_catalog.
     The Properties provided must match properties accepted by PyIceberg for each catalog type
-    See: :func:`deltacat.catalog.iceberg.initialize`
+    See: :func:`deltacat.experimental.catalog.iceberg.initialize`
 
     Attributes:
         type: The PyIceberg Catalog instance
deltacat/{catalog → experimental/catalog}/iceberg/impl.py
RENAMED
@@ -1,4 +1,5 @@
 import logging
+import sys
 
 from typing import Any, Dict, List, Optional, Union
 
@@ -7,13 +8,19 @@ from daft.daft import ScanOperatorHandle, StorageConfig
 from daft.logical.builder import LogicalPlanBuilder
 
 from deltacat import logs
+from deltacat.catalog.model.catalog import Catalog
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.daft.daft_scan import DeltaCatScanOperator
+from deltacat.utils.daft import DeltaCatScanOperator
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
-from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
+from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
+    IcebergScanPlanner,
+)
+from deltacat.experimental.storage.iceberg.model import (
+    PartitionSchemeMapper,
+    SchemaMapper,
+)
 from deltacat.storage.model.partition import PartitionScheme
-from deltacat.storage.iceberg.impl import _get_native_catalog
+from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
 from deltacat.storage.model.sort_key import SortScheme
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -26,20 +33,31 @@ from deltacat.storage.model.types import (
     LocalTable,
     StreamFormat,
 )
-from deltacat.storage.iceberg import impl as IcebergStorage
+from deltacat.experimental.storage.iceberg import impl as IcebergStorage
 from deltacat.types.media import ContentType
 from deltacat.types.tables import TableWriteMode
 from deltacat.constants import DEFAULT_NAMESPACE
-from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+    IcebergCatalogConfig,
+)
 
-from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
 from pyiceberg.transforms import BucketTransform
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+IcebergCatalog = sys.modules[__name__]
+
+
+def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
+    """
+    Factory method to construct a catalog from Iceberg catalog configuration.
+    """
+    return Catalog(config, impl=IcebergCatalog, *args, **kwargs)
+
 
 # catalog functions
-def initialize(
+def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
     """
     Initializes an Iceberg catalog with the given config.
 
@@ -123,7 +141,7 @@ def write_to_table(
     )
     # TODO(pdames): only append s3:// to output file paths when writing to S3!
     out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
-    from deltacat.catalog.iceberg import overrides
+    from deltacat.experimental.catalog.iceberg import overrides
 
     overrides.append(
        table_definition.table.native_object,
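
The interesting change here is the new `from_config` factory: the module registers itself (via `sys.modules[__name__]`) as the `impl` of a generic DeltaCAT `Catalog`, while `initialize` now returns the underlying PyIceberg catalog. A minimal usage sketch; the `CatalogType.SQL` value and the property keys are illustrative assumptions, not taken from this diff:

from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
    IcebergCatalogConfig,
)
from pyiceberg.catalog import CatalogType

# hypothetical SQLite-backed PyIceberg catalog config, for illustration only
config = IcebergCatalogConfig(
    type=CatalogType.SQL,
    properties={"uri": "sqlite:///iceberg_catalog.db"},
)

# wrap the Iceberg impl module and its config in a DeltaCAT Catalog handle
catalog = IcebergCatalog.from_config(config)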
deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py
RENAMED
@@ -5,7 +5,7 @@ from deltacat.storage.model.scan.push_down import Pushdown
 from deltacat.storage.model.scan.scan_plan import ScanPlan
 from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
 from deltacat.storage.util.scan_planner import ScanPlanner
-from deltacat.storage.iceberg.impl import _try_load_iceberg_table
+from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table
 
 
 class IcebergScanPlanner(ScanPlanner):
deltacat/{storage → experimental/storage}/iceberg/impl.py
RENAMED
@@ -32,7 +32,7 @@ from deltacat.storage import (
     NamespaceProperties,
 )
 from deltacat.storage.model.manifest import Manifest
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
deltacat/experimental/storage/rivulet/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
+from deltacat.experimental.storage.rivulet.schema.schema import Field
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+from deltacat.experimental.storage.rivulet.schema.schema import Datatype
+
+__all__ = [
+    "Schema",
+    "Field",
+    "Dataset",
+    "Datatype",
+]
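
This new package initializer makes `deltacat.experimental.storage.rivulet` the single import point for Rivulet's core types. For illustration, the consolidated import (constructor signatures are not shown in this diff):

from deltacat.experimental.storage.rivulet import Dataset, Datatype, Field, Schema

# the package-level __all__ now re-exports the four core Rivulet types
print([t.__name__ for t in (Schema, Field, Dataset, Datatype)])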
deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py
RENAMED
@@ -2,10 +2,13 @@ from abc import ABC, abstractmethod
 from typing import Iterator, List, Any
 import pyarrow as pa
 
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet import Schema
-from deltacat.storage.rivulet.serializer import DataSerializer, MEMTABLE_DATA
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.serializer import (
+    DataSerializer,
+    MEMTABLE_DATA,
+)
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
 
 
 class ArrowSerializer(DataSerializer, ABC):
deltacat/{storage → experimental/storage}/rivulet/dataset.py
RENAMED
@@ -24,19 +24,23 @@ from deltacat.storage.model.shard import Shard, ShardingStrategy
 from deltacat.storage.model.stream import Stream, StreamLocator
 from deltacat.storage.model.transaction import TransactionOperationList
 from deltacat.storage.model.types import CommitState, StreamFormat
-from deltacat.storage.rivulet.fs.file_store import FileStore
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
-from deltacat.storage.rivulet import Schema, Field
+from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet import Schema, Field
 from deltacat.utils.export import export_dataset
 from .schema.schema import Datatype
 
-from deltacat.storage.rivulet.reader.data_scan import DataScan
-from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
-from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 
-from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
-from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
+from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
+from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
     MemtableDatasetWriter,
 )
 
deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py
RENAMED
@@ -2,13 +2,16 @@ from __future__ import annotations
 
 from typing import List, Callable, Any
 
-from deltacat.storage.rivulet.
-from deltacat.storage.rivulet
-from deltacat.storage.rivulet import
-from deltacat.storage.rivulet.reader.
-
-
-from deltacat.storage.rivulet.reader.
+from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 
 
 class DatasetExecutor:
@@ -22,12 +25,10 @@ class DatasetExecutor:
 
     def __init__(
         self,
-        field_groups: List[FieldGroup],
         schema: Schema,
        metastore: DatasetMetastore,
     ):
         self.effective_schema: Schema = schema.__deepcopy__()
-        self.field_groups = field_groups
         self.output: MvpTable | None = None
         self._metastore = metastore
 
@@ -64,18 +65,9 @@ class DatasetExecutor:
 
         TODO for now this is doing dumb in-memory implementation and later this is going to be replaced by rust library
         """
-        if len(self.field_groups) == 1:
-            return self._read_as_mvp_table(schema, self.field_groups[0])
-        else:
-            ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
-            ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
-            merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
-            for i in range(2, len(self.field_groups)):
-                ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
-                merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
-            return merged
+        return self._read_as_mvp_table(schema)
 
-    def _read_as_mvp_table(self, schema: Schema
+    def _read_as_mvp_table(self, schema: Schema):
         data = list(
             DataScan(
                 schema, QueryExpression(), DatasetReader(self._metastore)
deltacat/experimental/storage/rivulet/feather/__init__.py
ADDED
@@ -0,0 +1,7 @@
+# TODO later on this will be moved to a dedicated package
+from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
+from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
+    FileReaderRegistrar,
+)
+
+FileReaderRegistrar.register_reader("feather", FeatherFileReader)
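
This initializer illustrates Rivulet's pluggable file-format registry: importing the subpackage registers its `FileReader` implementation under a file extension key. A sketch of registering an extra extension alias for the same reader; the "arrow" alias is a hypothetical example, not part of this release:

from deltacat.experimental.storage.rivulet.feather.file_reader import FeatherFileReader
from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
    FileReaderRegistrar,
)

# hypothetical: Feather v2 files are Arrow IPC files, so an "arrow"
# extension could map to the same reader implementation
FileReaderRegistrar.register_reader("arrow", FeatherFileReader)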
deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py
RENAMED
@@ -5,15 +5,17 @@ from typing import Optional
 import pyarrow.ipc
 from pyarrow import RecordBatch, RecordBatchFileReader
 
-from deltacat.storage.rivulet.fs.file_provider import FileProvider
-from deltacat.storage.rivulet.metastore.sst import SSTableRow
-from deltacat.storage.rivulet.reader.data_reader import (
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+from deltacat.experimental.storage.rivulet.reader.data_reader import (
     RowAndKey,
     FileReader,
     FILE_FORMAT,
 )
-from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
-from deltacat.storage.rivulet.schema.schema import Schema
+from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
+    RecordBatchRowIndex,
+)
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
 
 
 class FeatherFileReader(FileReader[RecordBatchRowIndex]):
|