deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl

This diff shows the changes between publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
deltacat/__init__.py CHANGED
@@ -122,7 +122,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "2.0.0.post1"
+__version__ = "2.0.0.post2"
 
 
 __all__ = [
deltacat/api.py CHANGED
@@ -28,7 +28,10 @@ from deltacat.storage import (
     LocalTable,
     Metafile,
 )
-from deltacat.types.media import DatasetType
+from deltacat.types.media import (
+    DatasetType,
+    DatastoreType,
+)
 from deltacat.utils.url import (
     DeltaCatUrl,
     DeltaCatUrlReader,
@@ -83,8 +86,8 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def copy(
-    src: DeltaCatUrl,
-    dst: DeltaCatUrl,
+    src: Union[DeltaCatUrl, str],
+    dst: Union[DeltaCatUrl, str],
     *,
     transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
     extension_to_memory_multiplier: Dict[str, float] = {
@@ -153,6 +156,8 @@ def copy(
     Returns:
         None
     """
+    src = _resolve_url(src)
+    dst = _resolve_url(dst)
     if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
         return _copy_dc(src, dst, recursive=src.url.endswith("/**"))
     else:
@@ -305,12 +310,13 @@ class CustomReadKwargsProvider(ReadKwargsProvider):
 
 
 def list(
-    url: DeltaCatUrl,
+    url: Union[DeltaCatUrl, str],
     *,
     recursive: bool = False,
     dataset_type: Optional[DatasetType] = None,
     **kwargs,
 ) -> Union[List[Metafile], LocalTable, DistributedDataset]:
+    url = _resolve_url(url)
     if not url.is_deltacat_catalog_url():
         raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")
     if dataset_type in DatasetType.distributed():
@@ -345,21 +351,52 @@ def list(
     )
 
 
+def _resolve_url(url: Union[DeltaCatUrl, str]) -> DeltaCatUrl:
+    if isinstance(url, str):
+        try:
+            url = DeltaCatUrl(url)
+        except ValueError:
+            url = DatastoreType.get_url(url)
+            url = DeltaCatUrl(url)
+    return url
+
+
 def get(
-    url,
+    url: Union[DeltaCatUrl, str],
+    read_as: DatasetType = DatasetType.RAY_DATASET,
     *args,
     **kwargs,
 ) -> Union[Metafile, Dataset]:
-    reader = DeltaCatUrlReader(url)
+    """
+    Reads a DeltaCAT URL into a Metafile or Dataset. DeltaCAT URLs can either
+    reference objects registered in a DeltaCAT catalog, or unregistered external
+    objects that are readable into a Dataset. DeltaCAT automatically infers the right
+    Ray Data reader for the URL. If the URL is an unregistered external object,
+    the reader will be inferred from the URL's datastore type.
+
+    Args:
+        url: The DeltaCAT URL to read.
+        read_as: The DatasetType to read an unregistered external object as. Ignored for
+            registered DeltaCAT objects. Defaults to DatasetType.RAY_DATASET.
+        args: Additional arguments to pass to the reader.
+        kwargs: Additional keyword arguments to pass to the reader.
+
+    Returns:
+        A Metafile for registered DeltaCAT URLs or a Dataset containing the
+        data from the URL.
+    """
+    url = _resolve_url(url)
+    reader = DeltaCatUrlReader(url, dataset_type=read_as)
     return reader.read(*args, **kwargs)
 
 
 def put(
-    url: DeltaCatUrl,
+    url: Union[DeltaCatUrl, str],
     metafile: Optional[Metafile] = None,
     *args,
     **kwargs,
 ) -> Union[Metafile, str]:
+    url = _resolve_url(url)
     writer = DeltaCatUrlWriter(url, metafile=metafile)
     return writer.write(*args, **kwargs)
 
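A minimal usage sketch of the string-URL convenience added above, assuming `get` and `copy` are re-exported from the top-level `deltacat` package; the bucket and key names are hypothetical:

import deltacat as dc
from deltacat.types.media import DatasetType

# Plain string URLs are now accepted wherever a DeltaCatUrl was required.
# _resolve_url() first tries DeltaCatUrl(url) and, on ValueError, falls back
# to DatastoreType.get_url(url) to infer the datastore type from the URL.
ds = dc.get(
    "parquet+s3://my-example-bucket/data/part-0.parquet",  # hypothetical external path
    read_as=DatasetType.RAY_DATASET,  # default; ignored for registered catalog objects
)

# External-to-external copy with file format conversion, also via plain strings.
dc.copy(
    "text+s3://my-example-bucket/raw/part-0.txt",        # hypothetical source
    "parquet+s3://my-example-bucket/raw/part-0.parquet",  # hypothetical destination
)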
@@ -446,7 +446,9 @@ def write_to_table(
         "transaction": write_transaction,  # Pass transaction to update_table_version
     }
 
-    _get_storage(**catalog_kwargs).update_table_version(
+    _, updated_table_version_obj, _ = _get_storage(
+        **catalog_kwargs
+    ).update_table_version(
         namespace=namespace,
         table_name=table,
         table_version=table_version_obj.table_version,
@@ -465,9 +467,9 @@ def write_to_table(
         content_type,
         commit_staged_partition,
         table_version_obj,
+        updated_table_version_obj if schema_modified else None,
         namespace,
         table,
-        schema=updated_schema if schema_modified else table_version_obj.schema,
         original_fields=original_fields,
         **filtered_kwargs,
     )
@@ -743,61 +745,6 @@ def _convert_numpy_for_schema_validation(
     )
 
 
-def _build_entry_index_to_schema_mapping(
-    qualified_deltas: List[Delta], table_version_obj, **kwargs
-) -> List[Schema]:
-    """Build a mapping from manifest entry index to schema for reading operations.
-
-    Args:
-        qualified_deltas: List of deltas to process
-        table_version_obj: Table version containing schemas
-        **kwargs: Additional arguments passed to storage operations
-
-    Returns:
-        List mapping each manifest entry index to its corresponding schema
-
-    Raises:
-        ValueError: If a manifest's schema ID is not found in table version schemas
-    """
-    entry_index_to_schema = []
-    for delta in qualified_deltas:
-        if delta.manifest:
-            manifest = delta.manifest
-        else:
-            # Fetch manifest from storage
-            manifest = _get_storage(**kwargs).get_delta_manifest(
-                delta.locator,
-                **kwargs,
-            )
-        # Map manifest entry index to schema ID
-        schema_id = manifest.meta.schema_id
-
-        # Find the schema that matches this manifest's schema_id
-        matching_schema = None
-        if table_version_obj.schemas:
-            for schema in table_version_obj.schemas:
-                if schema.id == schema_id:
-                    matching_schema = schema
-                    break
-
-        if matching_schema is None:
-            available_schema_ids = (
-                [s.id for s in table_version_obj.schemas]
-                if table_version_obj.schemas
-                else []
-            )
-            raise ValueError(
-                f"Manifest schema ID {schema_id} not found in table version schemas. "
-                f"Available schema IDs: {available_schema_ids}. "
-            )
-
-        # Add the matching schema for each entry in this manifest
-        for _ in range(len(manifest.entries)):
-            entry_index_to_schema.append(matching_schema)
-
-    return entry_index_to_schema
-
-
 def _convert_data_if_needed(data: Dataset) -> Dataset:
     """Convert unsupported data types to supported ones."""
     if isinstance(data, daft.DataFrame):
@@ -950,10 +897,10 @@ def _stage_commit_and_compact(
     delta_type: DeltaType,
     content_type: ContentType,
     commit_staged_partition: bool,
-    table_version_obj: TableVersion,
+    original_table_version_obj: TableVersion,
+    updated_table_version_obj: Optional[TableVersion],
     namespace: str,
     table: str,
-    schema: Schema,
     original_fields: Set[str],
     **kwargs,
 ) -> None:
@@ -962,6 +909,12 @@ def _stage_commit_and_compact(
     # We explicitly pass the correct schema parameter
     kwargs.pop("schema", None)
 
+    resolved_table_version_obj = (
+        updated_table_version_obj
+        if updated_table_version_obj
+        else original_table_version_obj
+    )
+
     # Stage a delta with the data
     delta = _get_storage(**kwargs).stage_delta(
         data=converted_data,
@@ -971,7 +924,7 @@ def _stage_commit_and_compact(
         author=ManifestAuthor.of(
             name="deltacat.write_to_table", version=dc.__version__
         ),
-        schema=schema,
+        schema=resolved_table_version_obj.schema,
         **kwargs,
     )
 
@@ -982,25 +935,26 @@ def _stage_commit_and_compact(
 
     # Check compaction trigger decision
     should_compact = _trigger_compaction(
-        table_version_obj,
+        resolved_table_version_obj,
         delta,
         TableReadOptimizationLevel.MAX,
         **kwargs,
     )
     if should_compact:
         # Run V2 compaction session to merge or delete data
-        if table_version_obj.schema:
-            all_column_names = table_version_obj.schema.arrow.names
-        else:
+        if not original_table_version_obj.schema:
             raise RuntimeError("Table version schema is required to run compaction.")
+        original_table_version_column_names = (
+            original_table_version_obj.schema.arrow.names
+        )
         _run_compaction_session(
-            table_version_obj=table_version_obj,
+            table_version_obj=resolved_table_version_obj,
             partition=partition,
             latest_delta_stream_position=delta.stream_position,
             namespace=namespace,
             table=table,
             original_fields=original_fields,
-            all_column_names=all_column_names,
+            original_table_version_column_names=original_table_version_column_names,
             **kwargs,
         )
 
@@ -1232,7 +1186,7 @@ def _run_compaction_session(
     namespace: str,
     table: str,
     original_fields: Set[str],
-    all_column_names: List[str],
+    original_table_version_column_names: List[str],
     **kwargs,
 ) -> None:
     """
@@ -1254,7 +1208,8 @@ def _run_compaction_session(
     # Extract compaction configuration
     primary_keys = _get_compaction_primary_keys(table_version_obj)
     hash_bucket_count = _get_compaction_hash_bucket_count(
-        partition, table_version_obj
+        partition,
+        table_version_obj,
     )
 
     # Create compaction parameters
@@ -1265,7 +1220,7 @@ def _run_compaction_session(
         primary_keys,
         hash_bucket_count,
         original_fields=original_fields,
-        all_column_names=all_column_names,
+        all_column_names=original_table_version_column_names,
         **kwargs,
     )
 
@@ -1499,10 +1454,6 @@ def _download_and_process_table_data(
             return _convert_pandas_to_numpy(result)
         return result
 
-    # Get schemas for each manifest entry
-    entry_index_to_schema = _build_entry_index_to_schema_mapping(
-        qualified_deltas, table_version_obj, **kwargs
-    )
     # Standard non-empty schema table read path - merge deltas and download data
     merged_delta = Delta.merge_deltas(qualified_deltas)
@@ -1570,11 +1521,10 @@ def _download_and_process_table_data(
         result,
         table_type,
         table_version_obj.schema,
-        entry_index_to_schema,
         file_path_column,
         columns,
     )
-    # Convert to numpy if original request was for numpy
+    # Convert pandas to numpy if original request was for numpy
     if original_read_as == DatasetType.NUMPY:
         return _convert_pandas_to_numpy(result)
 
@@ -1589,22 +1539,25 @@ def _convert_pandas_to_numpy(dataset: Dataset):
 
 
 def _coerce_dataset_to_schema(
-    dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
+    dataset: Dataset,
+    target_schema: pa.Schema,
 ) -> Dataset:
     """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
     # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
     deltacat_schema = Schema.of(schema=target_schema)
-    return deltacat_schema.coerce(dataset, manifest_entry_schema)
+    return deltacat_schema.coerce(dataset)
 
 
 def _coerce_results_to_schema(
-    results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
+    results: Dataset,
+    target_schema: pa.Schema,
 ) -> List[Dataset]:
     """Coerce all table results to match the target schema."""
     coerced_results = []
     for i, table_result in enumerate(results):
         coerced_result = _coerce_dataset_to_schema(
-            table_result, target_schema, entry_index_to_schema[i]
+            table_result,
+            target_schema,
         )
         coerced_results.append(coerced_result)
         logger.debug(f"Coerced table {i} to unified schema")
@@ -1631,35 +1584,10 @@ def _create_target_schema(
     return arrow_schema
 
 
-def _create_entry_schemas_for_concatenation(
-    entry_index_to_schema: List[Schema],
-    columns: Optional[List[str]] = None,
-    file_path_column: Optional[str] = None,
-) -> List[Schema]:
-    """Create entry schemas for concatenation, optionally filtered by column selection."""
-    if columns is None:
-        # No column selection - return original schemas as-is
-        return entry_index_to_schema
-
-    # Column selection - filter each entry schema
-    modified_schemas = []
-    for entry_schema in entry_index_to_schema:
-        if entry_schema and entry_schema.arrow:
-            filtered_schema = _create_target_schema(
-                entry_schema.arrow, columns, file_path_column
-            )
-            modified_schemas.append(Schema.of(schema=filtered_schema))
-        else:
-            modified_schemas.append(entry_schema)
-
-    return modified_schemas
-
-
 def _handle_local_table_concatenation(
     results: Dataset,
     table_type: DatasetType,
     table_schema: Optional[Schema],
-    entry_index_to_schema: List[Schema],
     file_path_column: Optional[str] = None,
     columns: Optional[List[str]] = None,
 ) -> Dataset:
@@ -1670,14 +1598,10 @@ def _handle_local_table_concatenation(
     target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
     logger.debug(f"Created target schema: {target_schema.names}")
 
-    # Filter entry schemas to match column selection and file_path_column
-    modified_entry_schemas = _create_entry_schemas_for_concatenation(
-        entry_index_to_schema, columns, file_path_column
-    )
-
     # Coerce results to unified schema
     coerced_results = _coerce_results_to_schema(
-        results, target_schema, modified_entry_schemas
+        results,
+        target_schema,
     )
 
     # Second step: concatenate the coerced results
@@ -1,10 +1,10 @@
 import ray
-import deltacat
+import deltacat as dc
 import daft
 
 
 def print_package_version_info():
-    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"DeltaCAT Version: {dc.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")
 
@@ -12,18 +12,24 @@ def print_package_version_info():
 @ray.remote
 def hello_worker():
     print("Hello, Worker!")
+    df = daft.from_pydict({"hello": ["delta", "cat"]})
+    dc.write(df, "hello_world")
     print_package_version_info()
 
 
 def run():
     print("Hello, Driver!")
     print_package_version_info()
-    hello_worker.remote()
+    ray.get(hello_worker.remote())
+    df = dc.read("hello_world")
+    print("=== Table Written by Ray Worker ===")
+    print(df)
 
 
 if __name__ == "__main__":
     # initialize deltacat
-    deltacat.init()
+    # Catalog files will be stored in .deltacat/ in the current working directory.
+    dc.init_local()
 
     # run the example
     run()
@@ -90,6 +90,9 @@ def run(
 
 if __name__ == "__main__":
     """
+    This example script demonstrates how to use the `deltacat.copy` API to copy multimodal source files into
+    arbitrary destinations with optional file format conversion and UDF transformations using DeltaCAT URLs.
+
     Example 1: Run this script locally using Ray:
     $ python indexer.py \
     $ --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
@@ -105,7 +105,12 @@ def run(
 
 if __name__ == "__main__":
     """
-    # Run this example through a command of the form:
+    This example shows how to submit jobs to a remote Ray cluster that indexes source files into arbitrary destinations with
+    optional file format conversion using DeltaCAT URLs. It provides the option to run multiple sequential or concurrent jobs
+    for benchmarking.
+
+    # For example, the following command launches a remote Ray Cluster on AWS, downloads an external OpenAlex dataset text file,
+    # converts it to Parquet, and writes it back to AWS S3. It submits 100 jobs in parallel, each with a timeout of 90 seconds:
     $ python ./deltacat/examples/job_runner.py -- \
     $ --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
     $ --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
@@ -23,6 +23,7 @@ from deltacat.exceptions import (
     SchemaValidationError,
 )
 from deltacat.storage.model.types import (
+    LocalTable,
     SchemaConsistencyType,
     SortOrder,
     NullOrder,
@@ -30,6 +31,7 @@ from deltacat.storage.model.types import (
 from deltacat.types.tables import (
     get_table_length,
     to_pyarrow,
+    get_table_column_names,
     from_pyarrow,
     get_dataset_type,
     SchemaEvolutionMode,
@@ -1174,8 +1176,7 @@ class Schema(dict):
 
     def coerce(
         self,
-        dataset: Union[pa.Table, pd.DataFrame, np.ndarray, Any],
-        manifest_entry_schema: Optional[Schema] = None,
+        dataset: LocalTable,
     ) -> Union[pa.Table, pd.DataFrame, np.ndarray, Any]:
         """Coerce a dataset to match this schema using field type promotion.
 
@@ -1196,7 +1197,6 @@ class Schema(dict):
 
         Args:
             dataset: Dataset to coerce to this schema
-            manifest_entry_schema: Original manifest entry schema used to write the dataset.
 
         Returns:
             Dataset of the same type, coerced to match this schema.
@@ -1208,10 +1208,23 @@ class Schema(dict):
             # No fields defined in schema, return original dataset
             return dataset
 
+        # Create pyarrow schema of fields common to the table schema and input dataset
+        common_fields = []
+        dataset_column_names = [
+            name.lower() for name in get_table_column_names(dataset)
+        ]
+        for field in self.fields:
+            if field.arrow.name.lower() in dataset_column_names:
+                common_fields.append(field.arrow)
+        # If no common fields, return original dataset
+        if not common_fields:
+            return dataset
+        common_schema = pa.schema(common_fields)
+
         # Convert dataset to PyArrow table for processing
         pa_table = to_pyarrow(
             dataset,
-            schema=manifest_entry_schema.arrow if manifest_entry_schema else None,
+            schema=common_schema,
         )
 
         # Process columns using field coercion
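A standalone sketch that mirrors the common-field selection now performed inside Schema.coerce(), using plain PyArrow; the schema and column names are illustrative only:

import pyarrow as pa

# Table schema declares two fields; the incoming dataset only provides one of
# them (with different casing) plus a column the schema does not know about.
table_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
dataset = pa.table({"ID": [1, 2], "extra": ["x", "y"]})

# Case-insensitive intersection of schema fields and dataset columns, as in
# the patched coerce(): only the matching fields are used as the conversion schema.
dataset_column_names = [name.lower() for name in dataset.column_names]
common_fields = [f for f in table_schema if f.name.lower() in dataset_column_names]
common_schema = pa.schema(common_fields)  # contains just "id"

If no common fields are found, the patched coerce() returns the input dataset unchanged.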
@@ -35,7 +35,15 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
         result = provider("base_path")
 
         self.assertTrue(isinstance(provider, FilenameProvider))
-        self.assertRegex(result, r"^base_path/[\w-]{36}$")
+        # assert that the result is a valid UUID
+        self.assertRegex(
+            result, r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
+        )
+        # after deleting the provider, expect to capture one write path with the base path as the prefix
+        del provider
+        write_paths = capture_object.write_paths()
+        self.assertEqual(len(write_paths), 1)
+        self.assertEqual(write_paths[0], f"base_path/{result}")
 
 
 class TestDownloadUpload(unittest.TestCase):