deltacat 2.0.0b12__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1048,11 +1048,11 @@ class TestCopyOnWrite:
  "dataset_type",
  [
  DatasetType.PANDAS,
- DatasetType.PYARROW, # Now supported with field tracking pipeline
- DatasetType.POLARS, # Now supported with field tracking pipeline
- DatasetType.DAFT, # Distributed dataset type - now supported with field tracking pipeline
- DatasetType.RAY_DATASET, # Distributed dataset type - now supported with field tracking pipeline
- DatasetType.NUMPY, # Now supported with from_pandas helper and field tracking pipeline
+ DatasetType.PYARROW,
+ DatasetType.POLARS,
+ DatasetType.DAFT,
+ DatasetType.RAY_DATASET,
+ DatasetType.NUMPY,
  ],
  )
  def test_partial_upsert_all_dataset_types(self, dataset_type):
@@ -1114,8 +1114,14 @@ class TestCopyOnWrite:
  table=table_name,
  namespace=self.test_namespace,
  catalog=self.catalog_name,
+ read_as=dataset_type,
+ )
+ table = dc.get_table(
+ table_name,
+ catalog=self.catalog_name,
+ namespace=self.test_namespace,
  )
- result_df = result.to_pandas()
+ result_df = dc.to_pandas(result, schema=table.table_version.schema.arrow)

  # Verify results
  assert len(result_df) == 4, f"Should have 4 records ({dataset_type.value})"
@@ -1239,8 +1245,14 @@ class TestCopyOnWrite:
  table=table_name,
  namespace=self.test_namespace,
  catalog=self.catalog_name,
+ read_as=dataset_type,
+ )
+ table = dc.get_table(
+ table_name,
+ catalog=self.catalog_name,
+ namespace=self.test_namespace,
  )
- result_df = result.to_pandas()
+ result_df = dc.to_pandas(result, schema=table.table_version.schema.arrow)

  # Verify results
  assert len(result_df) == 4, f"Should have 4 records ({dataset_type.value})"
@@ -1401,6 +1413,185 @@ class TestCopyOnWrite:

  self._verify_dataframe_contents(result, expected_final_data)

+ def test_schema_evolution_delta_manifest_schema_ids(self):
+ """
+ Test that delta manifest entries record correct schema IDs during schema evolution.
+
+ This test verifies the fix for the issue where MERGE operations with new columns
+ were recording incorrect schema IDs in delta manifest entries, causing reads
+ to use old schemas instead of evolved schemas.
+ """
+ from deltacat.storage.model.metafile import Metafile
+ from deltacat.storage.model.delta import Delta
+
+ table_name = "test_schema_evolution_manifest_ids"
+
+ # Step 1: Create table with merge keys (initial schema)
+ self._create_table_with_merge_keys(table_name)
+
+ # Step 2: Write initial data using PyArrow for an exact match with the declared schema
+ # This ensures that schema evolution isn't triggered by the first write (which would
+ # result in 2 schemas created by the first write instead of 1)
+ initial_data = pa.table(
+ {
+ "id": pa.array([1, 2, 3], type=pa.int64()),
+ "name": pa.array(["Alice", "Bob", "Charlie"], type=pa.string()),
+ "age": pa.array([25, 30, 35], type=pa.int32()),
+ "city": pa.array(["NYC", "LA", "Chicago"], type=pa.string()),
+ }
+ )
+ dc.write_to_table(
+ data=initial_data,
+ table=table_name,
+ namespace=self.test_namespace,
+ mode=TableWriteMode.MERGE,
+ content_type=ContentType.PARQUET,
+ catalog=self.catalog_name,
+ )
+
+ # Step 3: Write MERGE data with NEW COLUMNS (triggers schema evolution)
+ merge_data = pa.table(
+ {
+ "id": pa.array([1, 2, 4], type=pa.int64()), # Update existing + add new
+ "salary": pa.array(
+ [50000, 60000, 55000], type=pa.int64()
+ ), # NEW COLUMN
+ "department": pa.array(
+ ["Engineering", "Sales", "Marketing"], type=pa.string()
+ ), # NEW COLUMN
+ }
+ )
+
+ dc.write_to_table(
+ data=merge_data,
+ table=table_name,
+ namespace=self.test_namespace,
+ mode=TableWriteMode.MERGE,
+ content_type=ContentType.PARQUET,
+ catalog=self.catalog_name,
+ )
+
+ # Writing the same data again shouldn't trigger schema evolution
+ dc.write_to_table(
+ data=merge_data,
+ table=table_name,
+ namespace=self.test_namespace,
+ mode=TableWriteMode.MERGE,
+ content_type=ContentType.PARQUET,
+ catalog=self.catalog_name,
+ )
+
+ # Step 4: Get table definition to access schema evolution history
+ table_def = dc.get_table(
+ table=table_name,
+ namespace=self.test_namespace,
+ catalog=self.catalog_name,
+ )
+
+ all_schemas = table_def.table_version.schemas
+
+ # Verify we have schema evolution (should have 2 schemas: original + evolved)
+ assert (
+ len(all_schemas) == 2
+ ), f"Expected 2 schemas after evolution, got {len(all_schemas)}"
+
+ initial_schema = all_schemas[0] # Original schema
+ evolved_schema = all_schemas[1] # Latest schema after evolution
+
+ initial_schema_id = initial_schema.id
+ evolved_schema_id = evolved_schema.id
+
+ # Step 5: Extract schema IDs from delta manifest entries
+ def extract_schema_ids_from_deltas(all_objects):
+ """Extract schema IDs from Delta objects by parsing manifest entries."""
+ schema_ids = []
+ for obj in all_objects:
+ obj_type = Metafile.get_class(obj)
+ if obj_type == Delta:
+ delta_obj = obj
+ # Access manifest entries to get schema IDs
+ if delta_obj.manifest:
+ manifest = delta_obj.manifest
+ if manifest.entries:
+ for i, entry in enumerate(manifest.entries):
+ # Extract schema ID from manifest entry
+ if entry.meta and entry.meta.schema_id is not None:
+ schema_id_value = entry.meta.schema_id
+ schema_ids.append(schema_id_value)
+ return schema_ids
+
+ # Use dc.list with recursive=True to find all objects for this table
+ table_url = dc.DeltaCatUrl(
+ f"dc://{self.catalog_name}/{self.test_namespace}/{table_name}"
+ )
+ all_objects = dc.list(table_url, recursive=True)
+
+ # Extract schema IDs from all delta manifest entries
+ manifest_schema_ids = extract_schema_ids_from_deltas(all_objects)
+
+ # Step 6: Verify schema ID correctness
+ # We should have exactly 4 manifest entries (1 from first write + 3 from second write + 0 from third write)
+ assert (
+ len(manifest_schema_ids) == 4
+ ), f"Expected 4 manifest entries with schema IDs, got {len(manifest_schema_ids)}"
+
+ # Check if manifest schema IDs match table schema IDs
+ table_schema_ids = {initial_schema_id, evolved_schema_id}
+ manifest_schema_ids_set = set(manifest_schema_ids)
+
+ if table_schema_ids == manifest_schema_ids_set:
+ # The first delta should use the initial schema ID
+ initial_entries = [
+ sid for sid in manifest_schema_ids if sid == initial_schema_id
+ ]
+ assert (
+ len(initial_entries) == 1
+ ), f"Expected 1 initial entry with schema ID {initial_schema_id}, but found {len(initial_entries)}"
+
+ # The second delta should use the evolved schema ID
+ evolved_entries = [
+ sid for sid in manifest_schema_ids if sid == evolved_schema_id
+ ]
+ assert (
+ len(evolved_entries) == 3
+ ), f"Expected 3 evolved entries with schema ID {evolved_schema_id}, but found {len(evolved_entries)}"
+ else:
+ # This should not happen with PyArrow tables - fail the test
+ assert (
+ False
+ ), f"Schema IDs should match. Table: {sorted(table_schema_ids)}, Manifest: {sorted(manifest_schema_ids_set)}"
+
+ # Step 7: Verify the data can be read correctly with evolved schema
+ final_data = dc.to_pandas(
+ dc.read_table(
+ table=table_name,
+ namespace=self.test_namespace,
+ catalog=self.catalog_name,
+ )
+ )
+
+ # Should have all original columns plus new columns
+ expected_columns = {"id", "name", "age", "city", "salary", "department"}
+ actual_columns = set(final_data.columns)
+ assert expected_columns.issubset(
+ actual_columns
+ ), f"Missing columns: {expected_columns - actual_columns}"
+
+ # Verify data integrity - all records should have both old and new data
+ assert (
+ len(final_data) == 4
+ ), f"Expected 4 records after merge, got {len(final_data)}"
+
+ # Check that evolved columns are properly populated
+ salary_values = final_data["salary"].dropna()
+ dept_values = final_data["department"].dropna()
+ assert (
+ len(salary_values) >= 3
+ ), f"Expected salary values for at least 3 records, got {len(salary_values)}"
+ assert (
+ len(dept_values) >= 3
+ ), f"Expected department values for at least 3 records, got {len(dept_values)}"
+
  def test_append_delta_count_compaction(self):
  """Test that compaction is triggered by appended delta count for APPEND mode writes."""
  table_name = "test_append_delta_compaction"
deltacat/types/media.py CHANGED
@@ -1,3 +1,5 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
  from enum import Enum
  from typing import Set, Dict

@@ -401,3 +403,283 @@ class DatastoreType(str, Enum):
  WARC = "warc"
  WEBDATASET = "webdataset"
  XML = "xml"
+
+ def url(self, url: str) -> str:
+ """
+ Returns a DeltaCAT URL string for this datastore type and the given base URL.
+ Typically, DeltaCAT URLs will be of the form <DatastoreType>+<URL>.
+
+ However, the following Datastore Types don't follow the <DatastoreType>+<URL> convention:
+
+ {DatastoreType.MONGO}: <mongodb_uri>?database=<db_name>&collection=<collection_name>&...
+ {DatastoreType.BIGQUERY}: bigquery://<project_id>/<dataset>?param1=val1&...
+ {DatastoreType.CLICKHOUSE}: <clickhouse_dsn>?table=<table_name>?param1=val1&...
+ {DatastoreType.DATABRICKS_TABLES}: databricks://<warehouse_id>?param1=val1&...
+ {DatastoreType.ICEBERG}: iceberg://<table_identifier>?param1=val1&...
+
+ Args:
+ url: The base URL to convert to a DeltaCAT URL.
+
+ Returns:
+ A DeltaCAT URL string for this datastore type and the given URL.
+ """
+ if self == DatastoreType.BIGQUERY:
+ raise ValueError(
+ f"No DataStore URL for BigQuery. Use a URL of the form: bigquery://<project_id>/<dataset>?param1=val1&..."
+ )
+ if self == DatastoreType.CLICKHOUSE:
+ raise ValueError(
+ f"No DataStore URL for ClickHouse. Use a URL of the form: <clickhouse_dsn>?table=<table_name>?param1=val1&..."
+ )
+ if self == DatastoreType.DATABRICKS_TABLES:
+ raise ValueError(
+ f"No DataStore URL for Databricks. Use a URL of the form: databricks://<warehouse_id>?param1=val1&..."
+ )
+ if self == DatastoreType.ICEBERG:
+ raise ValueError(
+ f"No DataStore URL for Iceberg. Use a URL of the form: iceberg://<table_identifier>?param1=val1&..."
+ )
+ if self == DatastoreType.MONGO:
+ raise ValueError(
+ f"No DataStore URL for MongoDB. Use a URL of the form: <mongodb_uri>?database=<db_name>&collection=<collection_name>&..."
+ )
+ if self in [
+ DatastoreType.DELTACAT,
+ DatastoreType.DELTACAT_NAMESPACE,
+ DatastoreType.DELTACAT_TABLE,
+ DatastoreType.DELTACAT_TABLE_VERSION,
+ DatastoreType.DELTACAT_STREAM,
+ DatastoreType.DELTACAT_PARTITION,
+ DatastoreType.DELTACAT_DELTA,
+ ]:
+ raise ValueError(
+ f"No DataStore URL for DeltaCAT. Use a URL of the form: dc://<catalog>/[namespace]/[table]/[tableversion]/[stream]/[partition]/[delta]"
+ )
+ return f"{self.value}+{url}"
+
+ @staticmethod
+ def from_url(url: str) -> DatastoreType:
+ """
+ Returns an inferred DatastoreType for the given URL.
+
+ Args:
+ url: The URL or file path to analyze for datastore type inference.
+
+ Returns:
+ An inferred DatastoreType for the given URL.
+
+ Raises:
+ ValueError: If a DatastoreType cannot be inferred from the given URL.
+ """
+ # Detect by prefix first
+ # DeltaCAT URLs
+ if url.startswith("dc://"):
+ return DatastoreType.DELTACAT
+
+ # External Datastore Types
+ if url.startswith("hudi+") or url.startswith("hudi://"):
+ return DatastoreType.HUDI
+ if url.startswith("iceberg+") or url.startswith("iceberg://"):
+ return DatastoreType.ICEBERG
+ if url.startswith("deltalake+") or url.startswith("deltalake://"):
+ return DatastoreType.DELTA_LAKE
+ if url.startswith("deltasharing+") or url.startswith("deltasharing://"):
+ return DatastoreType.DELTA_SHARING
+ if url.startswith("bigquery+") or url.startswith("bigquery://"):
+ return DatastoreType.BIGQUERY
+ if url.startswith("clickhouse+") or url.startswith("clickhouse://"):
+ return DatastoreType.CLICKHOUSE
+ if url.startswith("databricks+") or url.startswith("databricks://"):
+ return DatastoreType.DATABRICKS_TABLES
+ if url.startswith("mongodb+") or url.startswith("mongodb://"):
+ return DatastoreType.MONGO
+
+ # File Format Types
+ if url.startswith("binary+") or url.startswith("binary://"):
+ return DatastoreType.BINARY
+ if url.startswith("csv+") or url.startswith("csv://"):
+ return DatastoreType.CSV
+ if url.startswith("json+") or url.startswith("json://"):
+ return DatastoreType.JSON
+ if url.startswith("avro+") or url.startswith("avro://"):
+ return DatastoreType.AVRO
+ if url.startswith("orc+") or url.startswith("orc://"):
+ return DatastoreType.ORC
+ if url.startswith("feather+") or url.startswith("feather://"):
+ return DatastoreType.FEATHER
+ if url.startswith("numpy+") or url.startswith("numpy://"):
+ return DatastoreType.NUMPY
+ if url.startswith("parquet+") or url.startswith("parquet://"):
+ return DatastoreType.PARQUET
+ if url.startswith("hdf+") or url.startswith("hdf://"):
+ return DatastoreType.HDF
+ if url.startswith("lance+") or url.startswith("lance://"):
+ return DatastoreType.LANCE
+ if url.startswith("tfrecords+") or url.startswith("tfrecords://"):
+ return DatastoreType.TFRECORDS
+ if url.startswith("webdataset+") or url.startswith("webdataset://"):
+ return DatastoreType.WEBDATASET
+
+ # Text and Web Types
+ if url.startswith("text+") or url.startswith("text://"):
+ return DatastoreType.TEXT
+ if url.startswith("html+") or url.startswith("html://"):
+ return DatastoreType.HTML
+ if url.startswith("warc+") or url.startswith("warc://"):
+ return DatastoreType.WARC
+ if url.startswith("xml+") or url.startswith("xml://"):
+ return DatastoreType.XML
+
+ # Media Types
+ if url.startswith("audio+") or url.startswith("audio://"):
+ return DatastoreType.AUDIO
+ if url.startswith("images+") or url.startswith("images://"):
+ return DatastoreType.IMAGES
+ if url.startswith("videos+") or url.startswith("videos://"):
+ return DatastoreType.VIDEOS
+
+ extension = "." + url.split(".")[-1].lower()
+
+ # Fallback to file-extensions
+ if extension in [".parquet", ".pq"]:
+ return DatastoreType.PARQUET
+ if extension == ".csv":
+ return DatastoreType.CSV
+ if extension == ".json":
+ return DatastoreType.JSON
+ if extension == ".avro":
+ return DatastoreType.AVRO
+ if extension == ".orc":
+ return DatastoreType.ORC
+ if extension == ".feather":
+ return DatastoreType.FEATHER
+ if extension == ".npy":
+ return DatastoreType.NUMPY
+
+ # Text formats
+ if extension in [".txt", ".text", ".md"]:
+ return DatastoreType.TEXT
+
+ # Data science formats
+ if extension in [".hdf", ".h5", ".hdf5"]:
+ return DatastoreType.HDF
+ if extension == ".lance":
+ return DatastoreType.LANCE
+ if extension in [".tfrecords", ".tfrecord"]:
+ return DatastoreType.TFRECORDS
+ if extension == ".webdataset":
+ return DatastoreType.WEBDATASET
+
+ # Web formats
+ if extension in [".html", ".htm"]:
+ return DatastoreType.HTML
+ if extension == ".warc":
+ return DatastoreType.WARC
+ if extension == ".xml":
+ return DatastoreType.XML
+
+ # Binary formats
+ if extension in [".bin", ".exe", ".dll", ".so", ".dylib", ".a", ".lib"]:
+ return DatastoreType.BINARY
+
+ # Media formats - Images
+ if extension in [
+ ".jpg",
+ ".jpeg",
+ ".png",
+ ".gif",
+ ".bmp",
+ ".tiff",
+ ".tif",
+ ".ico",
+ ".webp",
+ ".svg",
+ ".heic",
+ ".heif",
+ ".jp2",
+ ".jfif",
+ ".pjpeg",
+ ".pjp",
+ ]:
+ return DatastoreType.IMAGES
+
+ # Media formats - Videos
+ if extension in [
+ ".mp4",
+ ".mov",
+ ".avi",
+ ".mkv",
+ ".webm",
+ ".flv",
+ ".wmv",
+ ".m4v",
+ ".3gp",
+ ".3g2",
+ ".f4v",
+ ".asf",
+ ".rm",
+ ".rmvb",
+ ".vob",
+ ".ogv",
+ ".drc",
+ ".mng",
+ ".qt",
+ ".yuv",
+ ".mpg",
+ ".mpeg",
+ ".m2v",
+ ".m2ts",
+ ".mts",
+ ".ts",
+ ]:
+ return DatastoreType.VIDEOS
+
+ # Media formats - Audio
+ if extension in [
+ ".mp3",
+ ".wav",
+ ".ogg",
+ ".flac",
+ ".aac",
+ ".m4a",
+ ".m4b",
+ ".m4p",
+ ".wma",
+ ".ra",
+ ".amr",
+ ".ape",
+ ".au",
+ ".gsm",
+ ".dss",
+ ".dvf",
+ ".msv",
+ ".opus",
+ ".tta",
+ ".voc",
+ ".vox",
+ ".wv",
+ ".3ga",
+ ".ac3",
+ ".adt",
+ ".adts",
+ ]:
+ return DatastoreType.AUDIO
+
+ # Default to binary
+ return DatastoreType.BINARY
+
+ @staticmethod
+ def get_url(url: str) -> str:
+ """
+ Returns a DeltaCAT URL string with an inferred datastore type for the given URL.
+
+ Args:
+ url: The URL or file path to analyze for datastore type inference.
+
+ Returns:
+ A DeltaCAT URL string for the inferred datastore type.
+
+ Raises:
+ ValueError: If a DeltaCAT URL cannot be inferred from the given URL.
+ """
+ return DatastoreType.from_url(url).url(url)
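
The new `DatastoreType.url()`, `from_url()`, and `get_url()` helpers added above can be exercised roughly as follows. This is a minimal sketch based only on the code in this diff; the printed values assume the enum's string values match the lowercase prefixes used in `from_url()` (e.g. `DatastoreType.PARQUET.value == "parquet"`).

```python
from deltacat.types.media import DatastoreType

# Explicit type -> DeltaCAT URL of the form <DatastoreType>+<URL>.
print(DatastoreType.PARQUET.url("s3://my-bucket/data/file.parquet"))
# expected: parquet+s3://my-bucket/data/file.parquet

# Inference by URL prefix.
print(DatastoreType.from_url("mongodb://localhost:27017?database=db&collection=events"))
# expected: DatastoreType.MONGO

# Inference by file extension when no recognized prefix is present.
print(DatastoreType.from_url("/tmp/events.csv"))
# expected: DatastoreType.CSV

# get_url() chains both steps: infer the type, then build the DeltaCAT URL.
print(DatastoreType.get_url("s3://my-bucket/data/file.parquet"))
# expected: parquet+s3://my-bucket/data/file.parquet

# Types with their own URL schemes (Iceberg, BigQuery, MongoDB, ...) raise on .url().
try:
    DatastoreType.ICEBERG.url("my_db.my_table")
except ValueError as e:
    print(e)
```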
deltacat/types/tables.py CHANGED
@@ -690,7 +690,7 @@ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
  if not tables: # Empty list
  return pd.DataFrame()

- # Convert list elements
+ # Convert list elements to the same type
  all_tables = []
  for i, table in enumerate(tables):
  try:
@@ -699,15 +699,9 @@ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
  except Exception as e:
  raise ValueError(f"Failed to convert list element {i}: {e}") from e

- # Concatenate with error handling - handle different table types
+ # Concatenate with error handling
  try:
- # Check if we have PyArrow tables
- if all(isinstance(table, pa.Table) for table in all_tables):
- # Use PyArrow concatenation for PyArrow tables
- return pa.concat_tables(all_tables, promote_options="permissive")
- else:
- # Use pandas concatenation for other types
- return pd.concat(all_tables, ignore_index=True, sort=False)
+ return concat_tables(all_tables, get_dataset_type(all_tables[0]))
  except Exception as e:
  raise ValueError(f"Failed to concatenate {len(all_tables)} tables: {e}") from e

@@ -879,7 +873,7 @@ def get_table_slicer(table: Union[LocalTable, DistributedDataset]) -> Callable:
  return _get_table_function(table, TABLE_CLASS_TO_SLICER_FUNC, "slicer")


- def get_dataset_type(dataset: Dataset) -> DatasetType:
+ def get_dataset_type(dataset: Union[LocalTable, DistributedDataset]) -> DatasetType:
  """Get the DatasetType enum value for a given dataset object.

  Args:
@@ -1382,7 +1376,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
  self.write_paths.append(write_path)
  if block is not None:
  self.blocks.append(block)
- return write_path
+ return filename

  def __call__(
  self,
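
The `UuidBlockWritePathProvider` change above (returning `filename` instead of `write_path`) pairs with the `posixpath.join(base_path, filename)` changes in `deltacat/utils/pandas.py`, `polars.py`, and `pyarrow.py` below: the provider now yields only a file name, and each writer composes the full path itself. A minimal sketch of that contract, not DeltaCAT's actual class:

```python
import posixpath
import uuid

def uuid_filename_provider(base_path: str) -> str:
    """Return only a generated file name; the caller joins it to its base path."""
    # base_path is accepted for interface compatibility but is no longer embedded
    # in the return value (hypothetical simplification of the real provider).
    return f"{uuid.uuid4()}.parquet"

base_path = "s3://my-bucket/my-table/partition-0"  # hypothetical base path
filename = uuid_filename_provider(base_path)
path = posixpath.join(base_path, filename)
print(path)  # e.g. s3://my-bucket/my-table/partition-0/<uuid>.parquet
```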
deltacat/utils/pandas.py CHANGED
@@ -1,6 +1,7 @@
  import csv
  import logging
  import math
+ import posixpath
  import bz2
  import gzip
  from functools import partial
@@ -403,12 +404,18 @@ def slice_dataframe(
  return dataframes


- def concat_dataframes(dataframes: List[pd.DataFrame]) -> Optional[pd.DataFrame]:
+ def concat_dataframes(
+ dataframes: List[pd.DataFrame],
+ axis: int = 0,
+ copy: bool = False,
+ ignore_index: bool = True,
+ **kwargs,
+ ) -> Optional[pd.DataFrame]:
  if dataframes is None or not len(dataframes):
  return None
  if len(dataframes) == 1:
  return next(iter(dataframes))
- return pd.concat(dataframes, axis=0, copy=False)
+ return pd.concat(dataframes, axis=axis, copy=copy, ignore_index=ignore_index)


  def append_column_to_dataframe(
@@ -807,5 +814,6 @@ def dataframe_to_file(
  f"implemented. Known content types: "
  f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}"
  )
- path = block_path_provider(base_path)
+ filename = block_path_provider(base_path)
+ path = posixpath.join(base_path, filename)
  writer(dataframe, path, filesystem=filesystem, **writer_kwargs)
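
The `concat_dataframes()` signature change above switches the default to `ignore_index=True`, so concatenated rows are renumbered instead of keeping each input's original index. A small pandas-only illustration of that difference:

```python
import pandas as pd

df1 = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
df2 = pd.DataFrame({"id": [3, 4], "name": ["Carol", "Dave"]})

# Previous behavior: each input keeps its own row index.
kept = pd.concat([df1, df2], axis=0)
print(list(kept.index))  # [0, 1, 0, 1]

# New default (ignore_index=True): rows are renumbered 0..n-1.
renumbered = pd.concat([df1, df2], axis=0, ignore_index=True)
print(list(renumbered.index))  # [0, 1, 2, 3]
```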
deltacat/utils/polars.py CHANGED
@@ -1,4 +1,5 @@
  import logging
+ import posixpath
  import bz2
  import gzip
  from functools import partial
@@ -274,7 +275,8 @@ def dataframe_to_file(
  f"implemented. Known content types: "
  f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
  )
- path = block_path_provider(base_path)
+ filename = block_path_provider(base_path)
+ path = posixpath.join(base_path, filename)
  logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
  writer(table, path, filesystem=filesystem, **writer_kwargs)

deltacat/utils/pyarrow.py CHANGED
@@ -6,6 +6,7 @@ import bz2
  import gzip
  import io
  import logging
+ import posixpath
  from functools import partial
  from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Tuple
  from datetime import datetime, date
@@ -1027,7 +1028,8 @@ def table_to_file(
  f"implemented. Known content types: "
  f"{CONTENT_TYPE_TO_PA_WRITE_FUNC.keys}"
  )
- path = block_path_provider(base_path)
+ filename = block_path_provider(base_path)
+ path = posixpath.join(base_path, filename)
  writer_kwargs = content_type_to_writer_kwargs(content_type)
  writer_kwargs.update(kwargs)
  logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
@@ -1493,7 +1495,9 @@ def file_to_parquet(


  def concat_tables(
- tables: List[Union[pa.Table, papq.ParquetFile]]
+ tables: List[Union[pa.Table, papq.ParquetFile]],
+ promote_options: Optional[str] = "permissive",
+ **kwargs,
  ) -> Optional[Union[pa.Table, List[papq.ParquetFile]]]:
  """
  Concatenate a list of PyArrow Tables or ParquetFiles.
@@ -1525,7 +1529,7 @@ def concat_tables(
  else:
  converted_tables.append(table)

- return pa.concat_tables(converted_tables)
+ return pa.concat_tables(converted_tables, promote_options=promote_options, **kwargs)


  def delta_manifest_to_table(
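
The `concat_tables()` change above defaults `promote_options` to `"permissive"` when delegating to `pa.concat_tables()`, which lets tables with different but compatible schemas be unioned, null-filling columns missing from any input. A quick illustration, assuming a PyArrow version (14.0+) that supports the `promote_options` keyword:

```python
import pyarrow as pa

t1 = pa.table({"id": pa.array([1, 2], type=pa.int64()),
               "name": pa.array(["Alice", "Bob"], type=pa.string())})
t2 = pa.table({"id": pa.array([3], type=pa.int64()),
               "salary": pa.array([50000], type=pa.int64())})

# The default promotion ("none") would fail here because the schemas differ;
# "permissive" unifies them and fills the missing columns with nulls.
merged = pa.concat_tables([t1, t2], promote_options="permissive")
print(merged.schema.names)  # ['id', 'name', 'salary']
print(merged.num_rows)      # 3
```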