deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/api.py +44 -7
- deltacat/catalog/main/impl.py +34 -110
- deltacat/examples/hello_world.py +10 -4
- deltacat/examples/indexer/indexer.py +3 -0
- deltacat/examples/indexer/job_runner.py +6 -1
- deltacat/storage/model/schema.py +17 -4
- deltacat/tests/aws/test_s3u.py +9 -1
- deltacat/tests/catalog/test_default_catalog_impl.py +198 -7
- deltacat/types/media.py +282 -0
- deltacat/types/tables.py +5 -11
- deltacat/utils/pandas.py +11 -3
- deltacat/utils/polars.py +3 -1
- deltacat/utils/pyarrow.py +7 -3
- deltacat/utils/url.py +22 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/METADATA +161 -47
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/RECORD +20 -20
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/licenses/LICENSE +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/top_level.txt +0 -0
@@ -1048,11 +1048,11 @@ class TestCopyOnWrite:
         "dataset_type",
         [
             DatasetType.PANDAS,
-            DatasetType.PYARROW,
-            DatasetType.POLARS,
-            DatasetType.DAFT,
-            DatasetType.RAY_DATASET,
-            DatasetType.NUMPY,
+            DatasetType.PYARROW,
+            DatasetType.POLARS,
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+            DatasetType.NUMPY,
         ],
     )
     def test_partial_upsert_all_dataset_types(self, dataset_type):
@@ -1114,8 +1114,14 @@ class TestCopyOnWrite:
             table=table_name,
             namespace=self.test_namespace,
             catalog=self.catalog_name,
+            read_as=dataset_type,
+        )
+        table = dc.get_table(
+            table_name,
+            catalog=self.catalog_name,
+            namespace=self.test_namespace,
         )
-        result_df =
+        result_df = dc.to_pandas(result, schema=table.table_version.schema.arrow)

         # Verify results
         assert len(result_df) == 4, f"Should have 4 records ({dataset_type.value})"
@@ -1239,8 +1245,14 @@ class TestCopyOnWrite:
             table=table_name,
             namespace=self.test_namespace,
             catalog=self.catalog_name,
+            read_as=dataset_type,
+        )
+        table = dc.get_table(
+            table_name,
+            catalog=self.catalog_name,
+            namespace=self.test_namespace,
         )
-        result_df =
+        result_df = dc.to_pandas(result, schema=table.table_version.schema.arrow)

         # Verify results
         assert len(result_df) == 4, f"Should have 4 records ({dataset_type.value})"
@@ -1401,6 +1413,185 @@ class TestCopyOnWrite:

         self._verify_dataframe_contents(result, expected_final_data)

+    def test_schema_evolution_delta_manifest_schema_ids(self):
+        """
+        Test that delta manifest entries record correct schema IDs during schema evolution.
+
+        This test verifies the fix for the issue where MERGE operations with new columns
+        were recording incorrect schema IDs in delta manifest entries, causing reads
+        to use old schemas instead of evolved schemas.
+        """
+        from deltacat.storage.model.metafile import Metafile
+        from deltacat.storage.model.delta import Delta
+
+        table_name = "test_schema_evolution_manifest_ids"
+
+        # Step 1: Create table with merge keys (initial schema)
+        self._create_table_with_merge_keys(table_name)
+
+        # Step 2: Write initial data using PyArrow for an exact match with the declared schema
+        # This ensures that schema evolution isn't triggered by the first write (which would
+        # result in 2 schemas created by the first write instead of 1)
+        initial_data = pa.table(
+            {
+                "id": pa.array([1, 2, 3], type=pa.int64()),
+                "name": pa.array(["Alice", "Bob", "Charlie"], type=pa.string()),
+                "age": pa.array([25, 30, 35], type=pa.int32()),
+                "city": pa.array(["NYC", "LA", "Chicago"], type=pa.string()),
+            }
+        )
+        dc.write_to_table(
+            data=initial_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            content_type=ContentType.PARQUET,
+            catalog=self.catalog_name,
+        )
+
+        # Step 3: Write MERGE data with NEW COLUMNS (triggers schema evolution)
+        merge_data = pa.table(
+            {
+                "id": pa.array([1, 2, 4], type=pa.int64()),  # Update existing + add new
+                "salary": pa.array(
+                    [50000, 60000, 55000], type=pa.int64()
+                ),  # NEW COLUMN
+                "department": pa.array(
+                    ["Engineering", "Sales", "Marketing"], type=pa.string()
+                ),  # NEW COLUMN
+            }
+        )
+
+        dc.write_to_table(
+            data=merge_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            content_type=ContentType.PARQUET,
+            catalog=self.catalog_name,
+        )
+
+        # Writing the same data again shouldn't trigger schema evolution
+        dc.write_to_table(
+            data=merge_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            content_type=ContentType.PARQUET,
+            catalog=self.catalog_name,
+        )
+
+        # Step 4: Get table definition to access schema evolution history
+        table_def = dc.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            catalog=self.catalog_name,
+        )
+
+        all_schemas = table_def.table_version.schemas
+
+        # Verify we have schema evolution (should have 2 schemas: original + evolved)
+        assert (
+            len(all_schemas) == 2
+        ), f"Expected 2 schemas after evolution, got {len(all_schemas)}"
+
+        initial_schema = all_schemas[0]  # Original schema
+        evolved_schema = all_schemas[1]  # Latest schema after evolution
+
+        initial_schema_id = initial_schema.id
+        evolved_schema_id = evolved_schema.id
+
+        # Step 5: Extract schema IDs from delta manifest entries
+        def extract_schema_ids_from_deltas(all_objects):
+            """Extract schema IDs from Delta objects by parsing manifest entries."""
+            schema_ids = []
+            for obj in all_objects:
+                obj_type = Metafile.get_class(obj)
+                if obj_type == Delta:
+                    delta_obj = obj
+                    # Access manifest entries to get schema IDs
+                    if delta_obj.manifest:
+                        manifest = delta_obj.manifest
+                        if manifest.entries:
+                            for i, entry in enumerate(manifest.entries):
+                                # Extract schema ID from manifest entry
+                                if entry.meta and entry.meta.schema_id is not None:
+                                    schema_id_value = entry.meta.schema_id
+                                    schema_ids.append(schema_id_value)
+            return schema_ids
+
+        # Use dc.list with recursive=True to find all objects for this table
+        table_url = dc.DeltaCatUrl(
+            f"dc://{self.catalog_name}/{self.test_namespace}/{table_name}"
+        )
+        all_objects = dc.list(table_url, recursive=True)
+
+        # Extract schema IDs from all delta manifest entries
+        manifest_schema_ids = extract_schema_ids_from_deltas(all_objects)
+
+        # Step 6: Verify schema ID correctness
+        # We should have exactly 4 manifest entries (1 from first write + 3 from second write + 0 from third write)
+        assert (
+            len(manifest_schema_ids) == 4
+        ), f"Expected 4 manifest entries with schema IDs, got {len(manifest_schema_ids)}"
+
+        # Check if manifest schema IDs match table schema IDs
+        table_schema_ids = {initial_schema_id, evolved_schema_id}
+        manifest_schema_ids_set = set(manifest_schema_ids)
+
+        if table_schema_ids == manifest_schema_ids_set:
+            # The first delta should use the initial schema ID
+            initial_entries = [
+                sid for sid in manifest_schema_ids if sid == initial_schema_id
+            ]
+            assert (
+                len(initial_entries) == 1
+            ), f"Expected 1 initial entry with schema ID {initial_schema_id}, but found {len(initial_entries)}"
+
+            # The second delta should use the evolved schema ID
+            evolved_entries = [
+                sid for sid in manifest_schema_ids if sid == evolved_schema_id
+            ]
+            assert (
+                len(evolved_entries) == 3
+            ), f"Expected 3 evolved entries with schema ID {evolved_schema_id}, but found {len(evolved_entries)}"
+        else:
+            # This should not happen with PyArrow tables - fail the test
+            assert (
+                False
+            ), f"Schema IDs should match. Table: {sorted(table_schema_ids)}, Manifest: {sorted(manifest_schema_ids_set)}"
+
+        # Step 7: Verify the data can be read correctly with evolved schema
+        final_data = dc.to_pandas(
+            dc.read_table(
+                table=table_name,
+                namespace=self.test_namespace,
+                catalog=self.catalog_name,
+            )
+        )
+
+        # Should have all original columns plus new columns
+        expected_columns = {"id", "name", "age", "city", "salary", "department"}
+        actual_columns = set(final_data.columns)
+        assert expected_columns.issubset(
+            actual_columns
+        ), f"Missing columns: {expected_columns - actual_columns}"
+
+        # Verify data integrity - all records should have both old and new data
+        assert (
+            len(final_data) == 4
+        ), f"Expected 4 records after merge, got {len(final_data)}"
+
+        # Check that evolved columns are properly populated
+        salary_values = final_data["salary"].dropna()
+        dept_values = final_data["department"].dropna()
+        assert (
+            len(salary_values) >= 3
+        ), f"Expected salary values for at least 3 records, got {len(salary_values)}"
+        assert (
+            len(dept_values) >= 3
+        ), f"Expected department values for at least 3 records, got {len(dept_values)}"
+
     def test_append_delta_count_compaction(self):
         """Test that compaction is triggered by appended delta count for APPEND mode writes."""
         table_name = "test_append_delta_compaction"
deltacat/types/media.py
CHANGED
@@ -1,3 +1,5 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
 from enum import Enum
 from typing import Set, Dict

@@ -401,3 +403,283 @@ class DatastoreType(str, Enum):
     WARC = "warc"
     WEBDATASET = "webdataset"
     XML = "xml"
+
+    def url(self, url: str) -> str:
+        """
+        Returns a DeltaCAT URL string for this datastore type and the given base URL.
+        Typically, DeltaCAT URLs will be of the form <DatastoreType>+<URL>.
+
+        However, the following Datastore Types don't follow the <DatastoreType>+<URL> convention:
+
+        {DatastoreType.MONGO}: <mongodb_uri>?database=<db_name>&collection=<collection_name>&...
+        {DatastoreType.BIGQUERY}: bigquery://<project_id>/<dataset>?param1=val1&...
+        {DatastoreType.CLICKHOUSE}: <clickhouse_dsn>?table=<table_name>?param1=val1&...
+        {DatastoreType.DATABRICKS_TABLES}: databricks://<warehouse_id>?param1=val1&...
+        {DatastoreType.ICEBERG}: iceberg://<table_identifier>?param1=val1&...
+
+        Args:
+            url: The base URL to convert to a DeltaCAT URL.
+
+        Returns:
+            A DeltaCAT URL string for this datastore type and the given URL.
+        """
+        if self == DatastoreType.BIGQUERY:
+            raise ValueError(
+                f"No DataStore URL for BigQuery. Use a URL of the form: bigquery://<project_id>/<dataset>?param1=val1&..."
+            )
+        if self == DatastoreType.CLICKHOUSE:
+            raise ValueError(
+                f"No DataStore URL for ClickHouse. Use a URL of the form: <clickhouse_dsn>?table=<table_name>?param1=val1&..."
+            )
+        if self == DatastoreType.DATABRICKS_TABLES:
+            raise ValueError(
+                f"No DataStore URL for Databricks. Use a URL of the form: databricks://<warehouse_id>?param1=val1&..."
+            )
+        if self == DatastoreType.ICEBERG:
+            raise ValueError(
+                f"No DataStore URL for Iceberg. Use a URL of the form: iceberg://<table_identifier>?param1=val1&..."
+            )
+        if self == DatastoreType.MONGO:
+            raise ValueError(
+                f"No DataStore URL for MongoDB. Use a URL of the form: <mongodb_uri>?database=<db_name>&collection=<collection_name>&..."
+            )
+        if self in [
+            DatastoreType.DELTACAT,
+            DatastoreType.DELTACAT_NAMESPACE,
+            DatastoreType.DELTACAT_TABLE,
+            DatastoreType.DELTACAT_TABLE_VERSION,
+            DatastoreType.DELTACAT_STREAM,
+            DatastoreType.DELTACAT_PARTITION,
+            DatastoreType.DELTACAT_DELTA,
+        ]:
+            raise ValueError(
+                f"No DataStore URL for DeltaCAT. Use a URL of the form: dc://<catalog>/[namespace]/[table]/[tableversion]/[stream]/[partition]/[delta]"
+            )
+        return f"{self.value}+{url}"
+
+    @staticmethod
+    def from_url(url: str) -> DatastoreType:
+        """
+        Returns an inferred DatastoreType for the given URL.
+
+        Args:
+            url: The URL or file path to analyze for datastore type inference.
+
+        Returns:
+            An inferred DatastoreType for the given URL.
+
+        Raises:
+            ValueError: If a DatastoreType cannot be inferred from the given URL.
+        """
+        # Detect by prefix first
+        # DeltaCAT URLs
+        if url.startswith("dc://"):
+            return DatastoreType.DELTACAT
+
+        # External Datastore Types
+        if url.startswith("hudi+") or url.startswith("hudi://"):
+            return DatastoreType.HUDI
+        if url.startswith("iceberg+") or url.startswith("iceberg://"):
+            return DatastoreType.ICEBERG
+        if url.startswith("deltalake+") or url.startswith("deltalake://"):
+            return DatastoreType.DELTA_LAKE
+        if url.startswith("deltasharing+") or url.startswith("deltasharing://"):
+            return DatastoreType.DELTA_SHARING
+        if url.startswith("bigquery+") or url.startswith("bigquery://"):
+            return DatastoreType.BIGQUERY
+        if url.startswith("clickhouse+") or url.startswith("clickhouse://"):
+            return DatastoreType.CLICKHOUSE
+        if url.startswith("databricks+") or url.startswith("databricks://"):
+            return DatastoreType.DATABRICKS_TABLES
+        if url.startswith("mongodb+") or url.startswith("mongodb://"):
+            return DatastoreType.MONGO
+
+        # File Format Types
+        if url.startswith("binary+") or url.startswith("binary://"):
+            return DatastoreType.BINARY
+        if url.startswith("csv+") or url.startswith("csv://"):
+            return DatastoreType.CSV
+        if url.startswith("json+") or url.startswith("json://"):
+            return DatastoreType.JSON
+        if url.startswith("avro+") or url.startswith("avro://"):
+            return DatastoreType.AVRO
+        if url.startswith("orc+") or url.startswith("orc://"):
+            return DatastoreType.ORC
+        if url.startswith("feather+") or url.startswith("feather://"):
+            return DatastoreType.FEATHER
+        if url.startswith("numpy+") or url.startswith("numpy://"):
+            return DatastoreType.NUMPY
+        if url.startswith("parquet+") or url.startswith("parquet://"):
+            return DatastoreType.PARQUET
+        if url.startswith("hdf+") or url.startswith("hdf://"):
+            return DatastoreType.HDF
+        if url.startswith("lance+") or url.startswith("lance://"):
+            return DatastoreType.LANCE
+        if url.startswith("tfrecords+") or url.startswith("tfrecords://"):
+            return DatastoreType.TFRECORDS
+        if url.startswith("webdataset+") or url.startswith("webdataset://"):
+            return DatastoreType.WEBDATASET
+
+        # Text and Web Types
+        if url.startswith("text+") or url.startswith("text://"):
+            return DatastoreType.TEXT
+        if url.startswith("html+") or url.startswith("html://"):
+            return DatastoreType.HTML
+        if url.startswith("warc+") or url.startswith("warc://"):
+            return DatastoreType.WARC
+        if url.startswith("xml+") or url.startswith("xml://"):
+            return DatastoreType.XML
+
+        # Media Types
+        if url.startswith("audio+") or url.startswith("audio://"):
+            return DatastoreType.AUDIO
+        if url.startswith("images+") or url.startswith("images://"):
+            return DatastoreType.IMAGES
+        if url.startswith("videos+") or url.startswith("videos://"):
+            return DatastoreType.VIDEOS
+
+        extension = "." + url.split(".")[-1].lower()
+
+        # Fallback to file-extensions
+        if extension in [".parquet", ".pq"]:
+            return DatastoreType.PARQUET
+        if extension == ".csv":
+            return DatastoreType.CSV
+        if extension == ".json":
+            return DatastoreType.JSON
+        if extension == ".avro":
+            return DatastoreType.AVRO
+        if extension == ".orc":
+            return DatastoreType.ORC
+        if extension == ".feather":
+            return DatastoreType.FEATHER
+        if extension == ".npy":
+            return DatastoreType.NUMPY
+
+        # Text formats
+        if extension in [".txt", ".text", ".md"]:
+            return DatastoreType.TEXT
+
+        # Data science formats
+        if extension in [".hdf", ".h5", ".hdf5"]:
+            return DatastoreType.HDF
+        if extension == ".lance":
+            return DatastoreType.LANCE
+        if extension in [".tfrecords", ".tfrecord"]:
+            return DatastoreType.TFRECORDS
+        if extension == ".webdataset":
+            return DatastoreType.WEBDATASET
+
+        # Web formats
+        if extension in [".html", ".htm"]:
+            return DatastoreType.HTML
+        if extension == ".warc":
+            return DatastoreType.WARC
+        if extension == ".xml":
+            return DatastoreType.XML
+
+        # Binary formats
+        if extension in [".bin", ".exe", ".dll", ".so", ".dylib", ".a", ".lib"]:
+            return DatastoreType.BINARY
+
+        # Media formats - Images
+        if extension in [
+            ".jpg",
+            ".jpeg",
+            ".png",
+            ".gif",
+            ".bmp",
+            ".tiff",
+            ".tif",
+            ".ico",
+            ".webp",
+            ".svg",
+            ".heic",
+            ".heif",
+            ".jp2",
+            ".jfif",
+            ".pjpeg",
+            ".pjp",
+        ]:
+            return DatastoreType.IMAGES
+
+        # Media formats - Videos
+        if extension in [
+            ".mp4",
+            ".mov",
+            ".avi",
+            ".mkv",
+            ".webm",
+            ".flv",
+            ".wmv",
+            ".m4v",
+            ".3gp",
+            ".3g2",
+            ".f4v",
+            ".asf",
+            ".rm",
+            ".rmvb",
+            ".vob",
+            ".ogv",
+            ".drc",
+            ".mng",
+            ".qt",
+            ".yuv",
+            ".mpg",
+            ".mpeg",
+            ".m2v",
+            ".m2ts",
+            ".mts",
+            ".ts",
+        ]:
+            return DatastoreType.VIDEOS
+
+        # Media formats - Audio
+        if extension in [
+            ".mp3",
+            ".wav",
+            ".ogg",
+            ".flac",
+            ".aac",
+            ".m4a",
+            ".m4b",
+            ".m4p",
+            ".wma",
+            ".ra",
+            ".amr",
+            ".ape",
+            ".au",
+            ".gsm",
+            ".dss",
+            ".dvf",
+            ".msv",
+            ".opus",
+            ".tta",
+            ".voc",
+            ".vox",
+            ".wv",
+            ".3ga",
+            ".ac3",
+            ".adt",
+            ".adts",
+        ]:
+            return DatastoreType.AUDIO
+
+        # Default to binary
+        return DatastoreType.BINARY
+
+    @staticmethod
+    def get_url(url: str) -> str:
+        """
+        Returns a DeltaCAT URL string with an inferred datastore type for the given URL.
+
+        Args:
+            url: The URL or file path to analyze for datastore type inference.
+
+        Returns:
+            A DeltaCAT URL string for the inferred datastore type.
+
+        Raises:
+            ValueError: If a DeltaCAT URL cannot be inferred from the given URL.
+        """
+        return DatastoreType.from_url(url).url(url)
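
A hedged sketch of how the new DatastoreType URL helpers above could be used. The file and bucket paths are invented, and the expected output strings assume the enum values follow the lowercase member names visible in the diff (warc, webdataset, xml).

from deltacat.types.media import DatastoreType

# Explicit: build a DeltaCAT URL of the form <DatastoreType>+<URL>.
print(DatastoreType.PARQUET.url("s3://my-bucket/data/part-0.parquet"))
# -> parquet+s3://my-bucket/data/part-0.parquet

# Inference: prefix match first, then file extension, defaulting to BINARY.
print(DatastoreType.from_url("csv+s3://my-bucket/data.csv"))  # DatastoreType.CSV
print(DatastoreType.from_url("/tmp/logs/events.json"))        # DatastoreType.JSON
print(DatastoreType.from_url("/tmp/unknown.blob"))            # DatastoreType.BINARY

# Convenience: infer the type and return the prefixed DeltaCAT URL in one call.
print(DatastoreType.get_url("/tmp/images/cat.png"))
# -> images+/tmp/images/cat.png

# Types with their own URL schemes (BIGQUERY, CLICKHOUSE, DATABRICKS_TABLES,
# ICEBERG, MONGO, and the DELTACAT_* members) raise ValueError from .url() and
# expect their native URL form instead.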
deltacat/types/tables.py
CHANGED
@@ -690,7 +690,7 @@ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
     if not tables:  # Empty list
         return pd.DataFrame()

-    # Convert list elements
+    # Convert list elements to the same type
     all_tables = []
     for i, table in enumerate(tables):
         try:
@@ -699,15 +699,9 @@ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
         except Exception as e:
             raise ValueError(f"Failed to convert list element {i}: {e}") from e

-    # Concatenate with error handling
+    # Concatenate with error handling
     try:
-
-        if all(isinstance(table, pa.Table) for table in all_tables):
-            # Use PyArrow concatenation for PyArrow tables
-            return pa.concat_tables(all_tables, promote_options="permissive")
-        else:
-            # Use pandas concatenation for other types
-            return pd.concat(all_tables, ignore_index=True, sort=False)
+        return concat_tables(all_tables, get_dataset_type(all_tables[0]))
     except Exception as e:
         raise ValueError(f"Failed to concatenate {len(all_tables)} tables: {e}") from e

@@ -879,7 +873,7 @@ def get_table_slicer(table: Union[LocalTable, DistributedDataset]) -> Callable:
     return _get_table_function(table, TABLE_CLASS_TO_SLICER_FUNC, "slicer")


-def get_dataset_type(dataset:
+def get_dataset_type(dataset: Union[LocalTable, DistributedDataset]) -> DatasetType:
     """Get the DatasetType enum value for a given dataset object.

     Args:
@@ -1382,7 +1376,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
         self.write_paths.append(write_path)
         if block is not None:
             self.blocks.append(block)
-        return
+        return filename

     def __call__(
         self,
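
A short sketch of the type-aware concatenation that _convert_all now delegates to. It assumes concat_tables, get_dataset_type, and DatasetType are importable from deltacat.types.tables, as they are used in the diff above; the sample frames are invented.

import pandas as pd
import pyarrow as pa

from deltacat.types.tables import DatasetType, concat_tables, get_dataset_type

frames = [
    pd.DataFrame({"id": [1, 2]}),
    pd.DataFrame({"id": [3]}),
]

# get_dataset_type() accepts any LocalTable or DistributedDataset and returns
# the matching DatasetType member.
print(get_dataset_type(frames[0]))              # DatasetType.PANDAS
print(get_dataset_type(pa.table({"id": [1]})))  # DatasetType.PYARROW

# Concatenation is now dispatched on the dataset type of the first element
# instead of special-casing PyArrow vs. pandas inline.
combined = concat_tables(frames, get_dataset_type(frames[0]))
print(len(combined))  # 3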
deltacat/utils/pandas.py
CHANGED
@@ -1,6 +1,7 @@
 import csv
 import logging
 import math
+import posixpath
 import bz2
 import gzip
 from functools import partial
@@ -403,12 +404,18 @@ def slice_dataframe(
     return dataframes


-def concat_dataframes(
+def concat_dataframes(
+    dataframes: List[pd.DataFrame],
+    axis: int = 0,
+    copy: bool = False,
+    ignore_index: bool = True,
+    **kwargs,
+) -> Optional[pd.DataFrame]:
     if dataframes is None or not len(dataframes):
         return None
     if len(dataframes) == 1:
         return next(iter(dataframes))
-    return pd.concat(dataframes, axis=
+    return pd.concat(dataframes, axis=axis, copy=copy, ignore_index=ignore_index)


 def append_column_to_dataframe(
@@ -807,5 +814,6 @@ def dataframe_to_file(
         f"implemented. Known content types: "
         f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}"
     )
-
+    filename = block_path_provider(base_path)
+    path = posixpath.join(base_path, filename)
     writer(dataframe, path, filesystem=filesystem, **writer_kwargs)
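
A quick sketch of the now-explicit concat_dataframes signature, using made-up frames; the defaults shown (axis=0, copy=False, ignore_index=True) are taken directly from the diff.

import pandas as pd

from deltacat.utils.pandas import concat_dataframes

a = pd.DataFrame({"x": [1, 2]})
b = pd.DataFrame({"x": [3]})

# Defaults now spelled out in the signature: row-wise concatenation with a
# reset index and no defensive copy.
print(concat_dataframes([a, b]))

# Edge cases preserved by the function: None or empty input returns None, and
# a single frame is returned as-is without calling pd.concat.
print(concat_dataframes(None))      # None
print(concat_dataframes([a]) is a)  # True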
deltacat/utils/polars.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import posixpath
 import bz2
 import gzip
 from functools import partial
@@ -274,7 +275,8 @@ def dataframe_to_file(
         f"implemented. Known content types: "
         f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
     )
-
+    filename = block_path_provider(base_path)
+    path = posixpath.join(base_path, filename)
     logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
     writer(table, path, filesystem=filesystem, **writer_kwargs)

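
The pandas, polars, and pyarrow writers now share the same path convention: the block path provider returns only a filename, which the writer joins onto the base path with posixpath. This pairs with the UuidBlockWritePathProvider change in deltacat/types/tables.py that returns the generated filename instead of None. A standalone sketch of that convention, with invented names and an illustrative filename format:

import posixpath
import uuid


def uuid_block_path_provider(base_path: str) -> str:
    # Hypothetical stand-in for UuidBlockWritePathProvider, which now returns
    # the generated filename rather than None; the ".parquet" suffix here is
    # only for illustration.
    return f"{uuid.uuid4()}.parquet"


base_path = "s3://my-bucket/my-table"       # hypothetical write destination
filename = uuid_block_path_provider(base_path)
path = posixpath.join(base_path, filename)  # full path handed to the writer
print(path)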
deltacat/utils/pyarrow.py
CHANGED
@@ -6,6 +6,7 @@ import bz2
 import gzip
 import io
 import logging
+import posixpath
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Tuple
 from datetime import datetime, date
@@ -1027,7 +1028,8 @@ def table_to_file(
         f"implemented. Known content types: "
         f"{CONTENT_TYPE_TO_PA_WRITE_FUNC.keys}"
     )
-
+    filename = block_path_provider(base_path)
+    path = posixpath.join(base_path, filename)
     writer_kwargs = content_type_to_writer_kwargs(content_type)
     writer_kwargs.update(kwargs)
     logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
@@ -1493,7 +1495,9 @@ def file_to_parquet(


 def concat_tables(
-    tables: List[Union[pa.Table, papq.ParquetFile]]
+    tables: List[Union[pa.Table, papq.ParquetFile]],
+    promote_options: Optional[str] = "permissive",
+    **kwargs,
 ) -> Optional[Union[pa.Table, List[papq.ParquetFile]]]:
     """
     Concatenate a list of PyArrow Tables or ParquetFiles.
@@ -1525,7 +1529,7 @@ def concat_tables(
         else:
             converted_tables.append(table)

-    return pa.concat_tables(converted_tables)
+    return pa.concat_tables(converted_tables, promote_options=promote_options, **kwargs)


 def delta_manifest_to_table(
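
Finally, a sketch of the widened deltacat.utils.pyarrow.concat_tables signature, which now forwards promote_options (defaulting to "permissive") and any extra keyword arguments to pa.concat_tables. The sample tables are invented.

import pyarrow as pa

from deltacat.utils.pyarrow import concat_tables

t1 = pa.table({"id": [1, 2]})
t2 = pa.table({"id": [3], "name": ["x"]})  # extra column relative to t1

# With the default promote_options="permissive", differing-but-compatible
# schemas are unified and missing columns are filled with nulls.
merged = concat_tables([t1, t2])
print(merged.column_names)  # e.g. ['id', 'name']
print(merged.num_rows)      # 3

# Stricter behavior remains available by overriding the default, e.g.
# concat_tables([t1, t2], promote_options="none") would raise on the mismatch.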
|