deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/api.py +44 -7
- deltacat/catalog/main/impl.py +34 -110
- deltacat/examples/hello_world.py +10 -4
- deltacat/examples/indexer/indexer.py +3 -0
- deltacat/examples/indexer/job_runner.py +6 -1
- deltacat/storage/model/schema.py +17 -4
- deltacat/tests/aws/test_s3u.py +9 -1
- deltacat/tests/catalog/test_default_catalog_impl.py +198 -7
- deltacat/types/media.py +282 -0
- deltacat/types/tables.py +5 -11
- deltacat/utils/pandas.py +11 -3
- deltacat/utils/polars.py +3 -1
- deltacat/utils/pyarrow.py +7 -3
- deltacat/utils/url.py +22 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/METADATA +161 -47
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/RECORD +20 -20
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/licenses/LICENSE +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/api.py
CHANGED
@@ -28,7 +28,10 @@ from deltacat.storage import (
     LocalTable,
     Metafile,
 )
-from deltacat.types.media import
+from deltacat.types.media import (
+    DatasetType,
+    DatastoreType,
+)
 from deltacat.utils.url import (
     DeltaCatUrl,
     DeltaCatUrlReader,
@@ -83,8 +86,8 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def copy(
-    src: DeltaCatUrl,
-    dst: DeltaCatUrl,
+    src: Union[DeltaCatUrl, str],
+    dst: Union[DeltaCatUrl, str],
     *,
     transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
     extension_to_memory_multiplier: Dict[str, float] = {
@@ -153,6 +156,8 @@ def copy(
     Returns:
         None
     """
+    src = _resolve_url(src)
+    dst = _resolve_url(dst)
     if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
         return _copy_dc(src, dst, recursive=src.url.endswith("/**"))
     else:
@@ -305,12 +310,13 @@ class CustomReadKwargsProvider(ReadKwargsProvider):


 def list(
-    url: DeltaCatUrl,
+    url: Union[DeltaCatUrl, str],
     *,
     recursive: bool = False,
     dataset_type: Optional[DatasetType] = None,
     **kwargs,
 ) -> Union[List[Metafile], LocalTable, DistributedDataset]:
+    url = _resolve_url(url)
     if not url.is_deltacat_catalog_url():
         raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")
     if dataset_type in DatasetType.distributed():
@@ -345,21 +351,52 @@ def list(
     )


+def _resolve_url(url: Union[DeltaCatUrl, str]) -> DeltaCatUrl:
+    if isinstance(url, str):
+        try:
+            url = DeltaCatUrl(url)
+        except ValueError:
+            url = DatastoreType.get_url(url)
+            url = DeltaCatUrl(url)
+    return url
+
+
 def get(
-    url,
+    url: Union[DeltaCatUrl, str],
+    read_as: DatasetType = DatasetType.RAY_DATASET,
     *args,
     **kwargs,
 ) -> Union[Metafile, Dataset]:
-
+    """
+    Reads a DeltaCAT URL into a Metafile or Dataset. DeltaCAT URLs can either
+    reference objects registered in a DeltaCAT catalog, or unregistered external
+    objects that are readable into a Dataset. DeltaCAT automatically infers the right
+    Ray Data reader for the URL. If the URL is an unregistered external object,
+    the reader will be inferred from the URL's datastore type.
+
+    Args:
+        url: The DeltaCAT URL to read.
+        read_as: The DatasetType to read an unregistered external object as. Ignored for
+            registered DeltaCAT objects. Defaults to DatasetType.RAY_DATASET.
+        args: Additional arguments to pass to the reader.
+        kwargs: Additional keyword arguments to pass to the reader.
+
+    Returns:
+        A Metafile for registered DeltaCAT URLs or a Dataset containing the
+        data from the URL.
+    """
+    url = _resolve_url(url)
+    reader = DeltaCatUrlReader(url, dataset_type=read_as)
     return reader.read(*args, **kwargs)


 def put(
-    url: DeltaCatUrl,
+    url: Union[DeltaCatUrl, str],
     metafile: Optional[Metafile] = None,
     *args,
     **kwargs,
 ) -> Union[Metafile, str]:
+    url = _resolve_url(url)
     writer = DeltaCatUrlWriter(url, metafile=metafile)
     return writer.write(*args, **kwargs)

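A minimal usage sketch of the string-URL support added above (not part of the diff; it assumes `get` is importable from `deltacat.api` as defined in this file, and the S3 path is hypothetical):

    from deltacat.api import get
    from deltacat.types.media import DatasetType

    # Plain strings are now accepted wherever a DeltaCatUrl was required; they are
    # resolved by _resolve_url, which falls back to DatastoreType.get_url when the
    # string is not already a valid DeltaCAT URL.
    ds = get(
        "text+s3://my-bucket/raw/example.txt",  # hypothetical external source
        read_as=DatasetType.RAY_DATASET,
    )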
deltacat/catalog/main/impl.py
CHANGED
@@ -446,7 +446,9 @@ def write_to_table(
         "transaction": write_transaction,  # Pass transaction to update_table_version
     }

-    _get_storage(
+    _, updated_table_version_obj, _ = _get_storage(
+        **catalog_kwargs
+    ).update_table_version(
         namespace=namespace,
         table_name=table,
         table_version=table_version_obj.table_version,
@@ -465,9 +467,9 @@ def write_to_table(
         content_type,
         commit_staged_partition,
         table_version_obj,
+        updated_table_version_obj if schema_modified else None,
         namespace,
         table,
-        schema=updated_schema if schema_modified else table_version_obj.schema,
         original_fields=original_fields,
         **filtered_kwargs,
     )
@@ -743,61 +745,6 @@ def _convert_numpy_for_schema_validation(
     )


-def _build_entry_index_to_schema_mapping(
-    qualified_deltas: List[Delta], table_version_obj, **kwargs
-) -> List[Schema]:
-    """Build a mapping from manifest entry index to schema for reading operations.
-
-    Args:
-        qualified_deltas: List of deltas to process
-        table_version_obj: Table version containing schemas
-        **kwargs: Additional arguments passed to storage operations
-
-    Returns:
-        List mapping each manifest entry index to its corresponding schema
-
-    Raises:
-        ValueError: If a manifest's schema ID is not found in table version schemas
-    """
-    entry_index_to_schema = []
-    for delta in qualified_deltas:
-        if delta.manifest:
-            manifest = delta.manifest
-        else:
-            # Fetch manifest from storage
-            manifest = _get_storage(**kwargs).get_delta_manifest(
-                delta.locator,
-                **kwargs,
-            )
-        # Map manifest entry index to schema ID
-        schema_id = manifest.meta.schema_id
-
-        # Find the schema that matches this manifest's schema_id
-        matching_schema = None
-        if table_version_obj.schemas:
-            for schema in table_version_obj.schemas:
-                if schema.id == schema_id:
-                    matching_schema = schema
-                    break
-
-        if matching_schema is None:
-            available_schema_ids = (
-                [s.id for s in table_version_obj.schemas]
-                if table_version_obj.schemas
-                else []
-            )
-            raise ValueError(
-                f"Manifest schema ID {schema_id} not found in table version schemas. "
-                f"Available schema IDs: {available_schema_ids}. "
-            )
-
-        # Add the matching schema for each entry in this manifest
-        for _ in range(len(manifest.entries)):
-            entry_index_to_schema.append(matching_schema)
-
-    return entry_index_to_schema
-
-
 def _convert_data_if_needed(data: Dataset) -> Dataset:
     """Convert unsupported data types to supported ones."""
     if isinstance(data, daft.DataFrame):
@@ -950,10 +897,10 @@ def _stage_commit_and_compact(
     delta_type: DeltaType,
     content_type: ContentType,
     commit_staged_partition: bool,
-
+    original_table_version_obj: TableVersion,
+    updated_table_version_obj: Optional[TableVersion],
     namespace: str,
     table: str,
-    schema: Schema,
     original_fields: Set[str],
     **kwargs,
 ) -> None:
@@ -962,6 +909,12 @@ def _stage_commit_and_compact(
     # We explicitly pass the correct schema parameter
     kwargs.pop("schema", None)

+    resolved_table_version_obj = (
+        updated_table_version_obj
+        if updated_table_version_obj
+        else original_table_version_obj
+    )
+
     # Stage a delta with the data
     delta = _get_storage(**kwargs).stage_delta(
         data=converted_data,
@@ -971,7 +924,7 @@ def _stage_commit_and_compact(
         author=ManifestAuthor.of(
             name="deltacat.write_to_table", version=dc.__version__
         ),
-        schema=schema,
+        schema=resolved_table_version_obj.schema,
         **kwargs,
     )

@@ -982,25 +935,26 @@ def _stage_commit_and_compact(

     # Check compaction trigger decision
     should_compact = _trigger_compaction(
-
+        resolved_table_version_obj,
         delta,
         TableReadOptimizationLevel.MAX,
         **kwargs,
     )
     if should_compact:
         # Run V2 compaction session to merge or delete data
-        if
-            all_column_names = table_version_obj.schema.arrow.names
-        else:
+        if not original_table_version_obj.schema:
             raise RuntimeError("Table version schema is required to run compaction.")
+        original_table_version_column_names = (
+            original_table_version_obj.schema.arrow.names
+        )
         _run_compaction_session(
-            table_version_obj=
+            table_version_obj=resolved_table_version_obj,
             partition=partition,
             latest_delta_stream_position=delta.stream_position,
             namespace=namespace,
             table=table,
             original_fields=original_fields,
-
+            original_table_version_column_names=original_table_version_column_names,
             **kwargs,
         )

@@ -1232,7 +1186,7 @@ def _run_compaction_session(
     namespace: str,
     table: str,
     original_fields: Set[str],
-
+    original_table_version_column_names: List[str],
     **kwargs,
 ) -> None:
     """
@@ -1254,7 +1208,8 @@ def _run_compaction_session(
     # Extract compaction configuration
     primary_keys = _get_compaction_primary_keys(table_version_obj)
     hash_bucket_count = _get_compaction_hash_bucket_count(
-        partition,
+        partition,
+        table_version_obj,
     )

     # Create compaction parameters
@@ -1265,7 +1220,7 @@ def _run_compaction_session(
         primary_keys,
         hash_bucket_count,
         original_fields=original_fields,
-        all_column_names=
+        all_column_names=original_table_version_column_names,
         **kwargs,
     )

@@ -1499,10 +1454,6 @@ def _download_and_process_table_data(
             return _convert_pandas_to_numpy(result)
         return result

-    # Get schemas for each manifest entry
-    entry_index_to_schema = _build_entry_index_to_schema_mapping(
-        qualified_deltas, table_version_obj, **kwargs
-    )
     # Standard non-empty schema table read path - merge deltas and download data
     merged_delta = Delta.merge_deltas(qualified_deltas)

@@ -1570,11 +1521,10 @@ def _download_and_process_table_data(
         result,
         table_type,
         table_version_obj.schema,
-        entry_index_to_schema,
         file_path_column,
         columns,
     )
-    # Convert to numpy if original request was for numpy
+    # Convert pandas to numpy if original request was for numpy
     if original_read_as == DatasetType.NUMPY:
         return _convert_pandas_to_numpy(result)

@@ -1589,22 +1539,25 @@ def _convert_pandas_to_numpy(dataset: Dataset):



 def _coerce_dataset_to_schema(
-    dataset: Dataset,
+    dataset: Dataset,
+    target_schema: pa.Schema,
 ) -> Dataset:
     """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
     # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
     deltacat_schema = Schema.of(schema=target_schema)
-    return deltacat_schema.coerce(dataset
+    return deltacat_schema.coerce(dataset)


 def _coerce_results_to_schema(
-    results: Dataset,
+    results: Dataset,
+    target_schema: pa.Schema,
 ) -> List[Dataset]:
     """Coerce all table results to match the target schema."""
     coerced_results = []
     for i, table_result in enumerate(results):
         coerced_result = _coerce_dataset_to_schema(
-            table_result,
+            table_result,
+            target_schema,
         )
         coerced_results.append(coerced_result)
         logger.debug(f"Coerced table {i} to unified schema")
@@ -1631,35 +1584,10 @@ def _create_target_schema(
     return arrow_schema


-def _create_entry_schemas_for_concatenation(
-    entry_index_to_schema: List[Schema],
-    columns: Optional[List[str]] = None,
-    file_path_column: Optional[str] = None,
-) -> List[Schema]:
-    """Create entry schemas for concatenation, optionally filtered by column selection."""
-    if columns is None:
-        # No column selection - return original schemas as-is
-        return entry_index_to_schema
-
-    # Column selection - filter each entry schema
-    modified_schemas = []
-    for entry_schema in entry_index_to_schema:
-        if entry_schema and entry_schema.arrow:
-            filtered_schema = _create_target_schema(
-                entry_schema.arrow, columns, file_path_column
-            )
-            modified_schemas.append(Schema.of(schema=filtered_schema))
-        else:
-            modified_schemas.append(entry_schema)
-
-    return modified_schemas
-
-
 def _handle_local_table_concatenation(
     results: Dataset,
     table_type: DatasetType,
     table_schema: Optional[Schema],
-    entry_index_to_schema: List[Schema],
     file_path_column: Optional[str] = None,
     columns: Optional[List[str]] = None,
 ) -> Dataset:
@@ -1670,14 +1598,10 @@ def _handle_local_table_concatenation(
     target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
     logger.debug(f"Created target schema: {target_schema.names}")

-    # Filter entry schemas to match column selection and file_path_column
-    modified_entry_schemas = _create_entry_schemas_for_concatenation(
-        entry_index_to_schema, columns, file_path_column
-    )
-
     # Coerce results to unified schema
     coerced_results = _coerce_results_to_schema(
-        results,
+        results,
+        target_schema,
     )

     # Second step: concatenate the coerced results
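For context, a hedged sketch of exercising the write path these changes touch, using the top-level dc.init_local/dc.write/dc.read calls that appear in the hello_world example below; the table name is hypothetical, and the schema-evolution comment is an assumption drawn from the hunks above, not a guarantee:

    import daft
    import deltacat as dc

    dc.init_local()  # local catalog stored under .deltacat/, as in hello_world.py
    dc.write(daft.from_pydict({"id": [1, 2]}), "events")
    # A follow-up write that adds a column presumably flows through the
    # update_table_version call above, so staging and compaction now receive the
    # updated table version object rather than only the original schema.
    dc.write(daft.from_pydict({"id": [3], "source": ["api"]}), "events")
    print(dc.read("events"))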
deltacat/examples/hello_world.py
CHANGED
@@ -1,10 +1,10 @@
 import ray
-import deltacat
+import deltacat as dc
 import daft


 def print_package_version_info():
-    print(f"DeltaCAT Version: {
+    print(f"DeltaCAT Version: {dc.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")

@@ -12,18 +12,24 @@ def print_package_version_info():
 @ray.remote
 def hello_worker():
     print("Hello, Worker!")
+    df = daft.from_pydict({"hello": ["delta", "cat"]})
+    dc.write(df, "hello_world")
     print_package_version_info()


 def run():
     print("Hello, Driver!")
     print_package_version_info()
-    hello_worker.remote()
+    ray.get(hello_worker.remote())
+    df = dc.read("hello_world")
+    print("=== Table Written by Ray Worker ===")
+    print(df)


 if __name__ == "__main__":
     # initialize deltacat
-    deltacat.
+    # Catalog files will be stored in .deltacat/ in the current working directory.
+    dc.init_local()

     # run the example
     run()
deltacat/examples/indexer/indexer.py
CHANGED
@@ -90,6 +90,9 @@ def run(

 if __name__ == "__main__":
     """
+    This example script demonstrates how to use the `deltacat.copy` API to copy multimodal source files into
+    arbitrary destinations with optional file format conversion and UDF transformations using DeltaCAT URLs.
+
     Example 1: Run this script locally using Ray:
     $ python indexer.py \
     $ --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
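For reference, a programmatic sketch of the same flow the CLI example above drives through `deltacat.copy`; the paths and the no-op transform are hypothetical, and the transform signature follows the `transforms` parameter declared in deltacat/api.py earlier in this diff:

    from deltacat.api import copy

    def passthrough(dataset, src_url):
        # A transform receives each loaded Dataset plus its source DeltaCatUrl and
        # returns the (possibly modified) Dataset to write to the destination.
        return dataset

    copy(
        "text+s3://my-bucket/raw/part1.txt",             # hypothetical source
        "parquet+s3://my-bucket/indexed/part1.parquet",  # hypothetical destination
        transforms=[passthrough],
    )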
deltacat/examples/indexer/job_runner.py
CHANGED
@@ -105,7 +105,12 @@ def run(

 if __name__ == "__main__":
     """
-
+    This example shows how to submit jobs to a remote Ray cluster that indexes source files into arbitrary destinations with
+    optional file format conversion using DeltaCAT URLs. It provides the option to run multiple sequential or concurrent jobs
+    for benchmarking.
+
+    # For example, the following command launches a remote Ray Cluster on AWS, downloads an external OpenAlex dataset text file,
+    # converts it to Parquet, and writes it back to AWS S3. It submits 100 jobs in parallel, each with a timeout of 90 seconds:
     $ python ./deltacat/examples/job_runner.py -- \
     $ --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
     $ --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
deltacat/storage/model/schema.py
CHANGED
@@ -23,6 +23,7 @@ from deltacat.exceptions import (
     SchemaValidationError,
 )
 from deltacat.storage.model.types import (
+    LocalTable,
     SchemaConsistencyType,
     SortOrder,
     NullOrder,
@@ -30,6 +31,7 @@ from deltacat.storage.model.types import (
 from deltacat.types.tables import (
     get_table_length,
     to_pyarrow,
+    get_table_column_names,
     from_pyarrow,
     get_dataset_type,
     SchemaEvolutionMode,
@@ -1174,8 +1176,7 @@ class Schema(dict):

     def coerce(
         self,
-        dataset:
-        manifest_entry_schema: Optional[Schema] = None,
+        dataset: LocalTable,
     ) -> Union[pa.Table, pd.DataFrame, np.ndarray, Any]:
         """Coerce a dataset to match this schema using field type promotion.

@@ -1196,7 +1197,6 @@ class Schema(dict):

         Args:
             dataset: Dataset to coerce to this schema
-            manifest_entry_schema: Original manifest entry schema used to write the dataset.

         Returns:
             Dataset of the same type, coerced to match this schema.
@@ -1208,10 +1208,23 @@ class Schema(dict):
             # No fields defined in schema, return original dataset
             return dataset

+        # Create pyarrow schema of fields common to the table schema and input dataset
+        common_fields = []
+        dataset_column_names = [
+            name.lower() for name in get_table_column_names(dataset)
+        ]
+        for field in self.fields:
+            if field.arrow.name.lower() in dataset_column_names:
+                common_fields.append(field.arrow)
+        # If no common fields, return original dataset
+        if not common_fields:
+            return dataset
+        common_schema = pa.schema(common_fields)
+
         # Convert dataset to PyArrow table for processing
         pa_table = to_pyarrow(
             dataset,
-            schema=
+            schema=common_schema,
         )

         # Process columns using field coercion
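An illustrative sketch of the revised coerce() signature (an assumption drawn from the hunk above, not an example shipped with the package); the field names and types are made up:

    import pyarrow as pa
    from deltacat.storage.model.schema import Schema

    schema = Schema.of(schema=pa.schema([("id", pa.int64()), ("name", pa.string())]))

    # Only "id" is common to the schema and this input (matched case-insensitively),
    # so it alone forms the conversion schema; the manifest_entry_schema argument
    # removed above is no longer required.
    data = pa.table({"id": pa.array([1, 2], type=pa.int32())})
    coerced = schema.coerce(data)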
deltacat/tests/aws/test_s3u.py
CHANGED
@@ -35,7 +35,15 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
         result = provider("base_path")

         self.assertTrue(isinstance(provider, FilenameProvider))
-
+        # assert that the result is a valid UUID
+        self.assertRegex(
+            result, r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
+        )
+        # after deleting the provider, expect to capture one write path with the base path as the prefix
+        del provider
+        write_paths = capture_object.write_paths()
+        self.assertEqual(len(write_paths), 1)
+        self.assertEqual(write_paths[0], f"base_path/{result}")


 class TestDownloadUpload(unittest.TestCase):