deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl

This diff shows the changes between publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
deltacat/__init__.py CHANGED
@@ -122,7 +122,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "2.0.0.post1"
+__version__ = "2.0.0.post2"
 
 
 __all__ = [
deltacat/api.py CHANGED
@@ -28,7 +28,10 @@ from deltacat.storage import (
     LocalTable,
     Metafile,
 )
-from deltacat.types.media import DatasetType
+from deltacat.types.media import (
+    DatasetType,
+    DatastoreType,
+)
 from deltacat.utils.url import (
     DeltaCatUrl,
     DeltaCatUrlReader,
@@ -83,8 +86,8 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def copy(
-    src: DeltaCatUrl,
-    dst: DeltaCatUrl,
+    src: Union[DeltaCatUrl, str],
+    dst: Union[DeltaCatUrl, str],
     *,
     transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
     extension_to_memory_multiplier: Dict[str, float] = {
@@ -153,6 +156,8 @@ def copy(
     Returns:
         None
     """
+    src = _resolve_url(src)
+    dst = _resolve_url(dst)
     if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
         return _copy_dc(src, dst, recursive=src.url.endswith("/**"))
     else:
@@ -305,12 +310,13 @@ class CustomReadKwargsProvider(ReadKwargsProvider):
 
 
 def list(
-    url: DeltaCatUrl,
+    url: Union[DeltaCatUrl, str],
     *,
     recursive: bool = False,
     dataset_type: Optional[DatasetType] = None,
     **kwargs,
 ) -> Union[List[Metafile], LocalTable, DistributedDataset]:
+    url = _resolve_url(url)
     if not url.is_deltacat_catalog_url():
         raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")
     if dataset_type in DatasetType.distributed():
@@ -345,21 +351,52 @@ def list(
     )
 
 
+def _resolve_url(url: Union[DeltaCatUrl, str]) -> DeltaCatUrl:
+    if isinstance(url, str):
+        try:
+            url = DeltaCatUrl(url)
+        except ValueError:
+            url = DatastoreType.get_url(url)
+            url = DeltaCatUrl(url)
+    return url
+
+
 def get(
-    url,
+    url: Union[DeltaCatUrl, str],
+    read_as: DatasetType = DatasetType.RAY_DATASET,
     *args,
     **kwargs,
 ) -> Union[Metafile, Dataset]:
-    reader = DeltaCatUrlReader(url)
+    """
+    Reads a DeltaCAT URL into a Metafile or Dataset. DeltaCAT URLs can either
+    reference objects registered in a DeltaCAT catalog, or unregistered external
+    objects that are readable into a Dataset. DeltaCAT automatically infers the right
+    Ray Data reader for the URL. If the URL is an unregistered external object,
+    the reader will be inferred from the URL's datastore type.
+
+    Args:
+        url: The DeltaCAT URL to read.
+        read_as: The DatasetType to read an unregistered external object as. Ignored for
+            registered DeltaCAT objects. Defaults to DatasetType.RAY_DATASET.
+        args: Additional arguments to pass to the reader.
+        kwargs: Additional keyword arguments to pass to the reader.
+
+    Returns:
+        A Metafile for registered DeltaCAT URLs or a Dataset containing the
+        data from the URL.
+    """
+    url = _resolve_url(url)
+    reader = DeltaCatUrlReader(url, dataset_type=read_as)
     return reader.read(*args, **kwargs)
 
 
 def put(
-    url: DeltaCatUrl,
+    url: Union[DeltaCatUrl, str],
     metafile: Optional[Metafile] = None,
     *args,
     **kwargs,
 ) -> Union[Metafile, str]:
+    url = _resolve_url(url)
     writer = DeltaCatUrlWriter(url, metafile=metafile)
     return writer.write(*args, **kwargs)
 
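A minimal usage sketch of the string-URL convenience added above, assuming `get` and `copy` are re-exported from the top-level `deltacat` package; the bucket and key names are hypothetical:

import deltacat as dc
from deltacat.types.media import DatasetType

# Plain string URLs are now accepted wherever a DeltaCatUrl was required.
# _resolve_url() first tries DeltaCatUrl(url) and, on ValueError, falls back
# to DatastoreType.get_url(url) to infer the datastore type from the URL.
ds = dc.get(
    "parquet+s3://my-example-bucket/data/part-0.parquet",  # hypothetical external path
    read_as=DatasetType.RAY_DATASET,  # default; ignored for registered catalog objects
)

# External-to-external copy with file format conversion, also via plain strings.
dc.copy(
    "text+s3://my-example-bucket/raw/part-0.txt",        # hypothetical source
    "parquet+s3://my-example-bucket/raw/part-0.parquet",  # hypothetical destination
)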
@@ -446,7 +446,9 @@ def write_to_table(
         "transaction": write_transaction,  # Pass transaction to update_table_version
     }
 
-    _get_storage(**catalog_kwargs).update_table_version(
+    _, updated_table_version_obj, _ = _get_storage(
+        **catalog_kwargs
+    ).update_table_version(
         namespace=namespace,
         table_name=table,
         table_version=table_version_obj.table_version,
@@ -465,9 +467,9 @@ def write_to_table(
         content_type,
         commit_staged_partition,
         table_version_obj,
+        updated_table_version_obj if schema_modified else None,
         namespace,
         table,
-        schema=updated_schema if schema_modified else table_version_obj.schema,
         original_fields=original_fields,
         **filtered_kwargs,
     )
@@ -743,61 +745,6 @@ def _convert_numpy_for_schema_validation(
     )
 
 
-def _build_entry_index_to_schema_mapping(
-    qualified_deltas: List[Delta], table_version_obj, **kwargs
-) -> List[Schema]:
-    """Build a mapping from manifest entry index to schema for reading operations.
-
-    Args:
-        qualified_deltas: List of deltas to process
-        table_version_obj: Table version containing schemas
-        **kwargs: Additional arguments passed to storage operations
-
-    Returns:
-        List mapping each manifest entry index to its corresponding schema
-
-    Raises:
-        ValueError: If a manifest's schema ID is not found in table version schemas
-    """
-    entry_index_to_schema = []
-    for delta in qualified_deltas:
-        if delta.manifest:
-            manifest = delta.manifest
-        else:
-            # Fetch manifest from storage
-            manifest = _get_storage(**kwargs).get_delta_manifest(
-                delta.locator,
-                **kwargs,
-            )
-        # Map manifest entry index to schema ID
-        schema_id = manifest.meta.schema_id
-
-        # Find the schema that matches this manifest's schema_id
-        matching_schema = None
-        if table_version_obj.schemas:
-            for schema in table_version_obj.schemas:
-                if schema.id == schema_id:
-                    matching_schema = schema
-                    break
-
-        if matching_schema is None:
-            available_schema_ids = (
-                [s.id for s in table_version_obj.schemas]
-                if table_version_obj.schemas
-                else []
-            )
-            raise ValueError(
-                f"Manifest schema ID {schema_id} not found in table version schemas. "
-                f"Available schema IDs: {available_schema_ids}. "
-            )
-
-        # Add the matching schema for each entry in this manifest
-        for _ in range(len(manifest.entries)):
-            entry_index_to_schema.append(matching_schema)
-
-    return entry_index_to_schema
-
-
 def _convert_data_if_needed(data: Dataset) -> Dataset:
     """Convert unsupported data types to supported ones."""
     if isinstance(data, daft.DataFrame):
@@ -950,10 +897,10 @@ def _stage_commit_and_compact(
     delta_type: DeltaType,
     content_type: ContentType,
     commit_staged_partition: bool,
-    table_version_obj: TableVersion,
+    original_table_version_obj: TableVersion,
+    updated_table_version_obj: Optional[TableVersion],
     namespace: str,
     table: str,
-    schema: Schema,
     original_fields: Set[str],
     **kwargs,
 ) -> None:
@@ -962,6 +909,12 @@ def _stage_commit_and_compact(
     # We explicitly pass the correct schema parameter
     kwargs.pop("schema", None)
 
+    resolved_table_version_obj = (
+        updated_table_version_obj
+        if updated_table_version_obj
+        else original_table_version_obj
+    )
+
     # Stage a delta with the data
     delta = _get_storage(**kwargs).stage_delta(
         data=converted_data,
@@ -971,7 +924,7 @@ def _stage_commit_and_compact(
         author=ManifestAuthor.of(
             name="deltacat.write_to_table", version=dc.__version__
         ),
-        schema=schema,
+        schema=resolved_table_version_obj.schema,
         **kwargs,
     )
 
@@ -982,25 +935,26 @@ def _stage_commit_and_compact(
 
     # Check compaction trigger decision
     should_compact = _trigger_compaction(
-        table_version_obj,
+        resolved_table_version_obj,
         delta,
         TableReadOptimizationLevel.MAX,
         **kwargs,
     )
     if should_compact:
         # Run V2 compaction session to merge or delete data
-        if table_version_obj.schema:
-            all_column_names = table_version_obj.schema.arrow.names
-        else:
+        if not original_table_version_obj.schema:
             raise RuntimeError("Table version schema is required to run compaction.")
+        original_table_version_column_names = (
+            original_table_version_obj.schema.arrow.names
+        )
         _run_compaction_session(
-            table_version_obj=table_version_obj,
+            table_version_obj=resolved_table_version_obj,
             partition=partition,
             latest_delta_stream_position=delta.stream_position,
             namespace=namespace,
             table=table,
             original_fields=original_fields,
-            all_column_names=all_column_names,
+            original_table_version_column_names=original_table_version_column_names,
             **kwargs,
         )
 
@@ -1232,7 +1186,7 @@ def _run_compaction_session(
     namespace: str,
     table: str,
     original_fields: Set[str],
-    all_column_names: List[str],
+    original_table_version_column_names: List[str],
     **kwargs,
 ) -> None:
     """
@@ -1254,7 +1208,8 @@ def _run_compaction_session(
     # Extract compaction configuration
     primary_keys = _get_compaction_primary_keys(table_version_obj)
     hash_bucket_count = _get_compaction_hash_bucket_count(
-        partition, table_version_obj
+        partition,
+        table_version_obj,
     )
 
     # Create compaction parameters
@@ -1265,7 +1220,7 @@ def _run_compaction_session(
         primary_keys,
         hash_bucket_count,
         original_fields=original_fields,
-        all_column_names=all_column_names,
+        all_column_names=original_table_version_column_names,
         **kwargs,
     )
 
@@ -1499,10 +1454,6 @@ def _download_and_process_table_data(
             return _convert_pandas_to_numpy(result)
         return result
 
-    # Get schemas for each manifest entry
-    entry_index_to_schema = _build_entry_index_to_schema_mapping(
-        qualified_deltas, table_version_obj, **kwargs
-    )
     # Standard non-empty schema table read path - merge deltas and download data
     merged_delta = Delta.merge_deltas(qualified_deltas)
@@ -1570,11 +1521,10 @@ def _download_and_process_table_data(
         result,
         table_type,
         table_version_obj.schema,
-        entry_index_to_schema,
         file_path_column,
         columns,
     )
-    # Convert to numpy if original request was for numpy
+    # Convert pandas to numpy if original request was for numpy
     if original_read_as == DatasetType.NUMPY:
         return _convert_pandas_to_numpy(result)
 
@@ -1589,22 +1539,25 @@ def _convert_pandas_to_numpy(dataset: Dataset):
 
 
 def _coerce_dataset_to_schema(
-    dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
+    dataset: Dataset,
+    target_schema: pa.Schema,
 ) -> Dataset:
     """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
     # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
     deltacat_schema = Schema.of(schema=target_schema)
-    return deltacat_schema.coerce(dataset, manifest_entry_schema)
+    return deltacat_schema.coerce(dataset)
 
 
 def _coerce_results_to_schema(
-    results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
+    results: Dataset,
+    target_schema: pa.Schema,
 ) -> List[Dataset]:
     """Coerce all table results to match the target schema."""
     coerced_results = []
     for i, table_result in enumerate(results):
         coerced_result = _coerce_dataset_to_schema(
-            table_result, target_schema, entry_index_to_schema[i]
+            table_result,
+            target_schema,
         )
         coerced_results.append(coerced_result)
         logger.debug(f"Coerced table {i} to unified schema")
@@ -1631,35 +1584,10 @@ def _create_target_schema(
     return arrow_schema
 
 
-def _create_entry_schemas_for_concatenation(
-    entry_index_to_schema: List[Schema],
-    columns: Optional[List[str]] = None,
-    file_path_column: Optional[str] = None,
-) -> List[Schema]:
-    """Create entry schemas for concatenation, optionally filtered by column selection."""
-    if columns is None:
-        # No column selection - return original schemas as-is
-        return entry_index_to_schema
-
-    # Column selection - filter each entry schema
-    modified_schemas = []
-    for entry_schema in entry_index_to_schema:
-        if entry_schema and entry_schema.arrow:
-            filtered_schema = _create_target_schema(
-                entry_schema.arrow, columns, file_path_column
-            )
-            modified_schemas.append(Schema.of(schema=filtered_schema))
-        else:
-            modified_schemas.append(entry_schema)
-
-    return modified_schemas
-
-
 def _handle_local_table_concatenation(
     results: Dataset,
     table_type: DatasetType,
     table_schema: Optional[Schema],
-    entry_index_to_schema: List[Schema],
     file_path_column: Optional[str] = None,
     columns: Optional[List[str]] = None,
 ) -> Dataset:
@@ -1670,14 +1598,10 @@ def _handle_local_table_concatenation(
     target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
     logger.debug(f"Created target schema: {target_schema.names}")
 
-    # Filter entry schemas to match column selection and file_path_column
-    modified_entry_schemas = _create_entry_schemas_for_concatenation(
-        entry_index_to_schema, columns, file_path_column
-    )
-
     # Coerce results to unified schema
     coerced_results = _coerce_results_to_schema(
-        results, target_schema, modified_entry_schemas
+        results,
+        target_schema,
     )
 
     # Second step: concatenate the coerced results
@@ -1,10 +1,10 @@
 import ray
-import deltacat
+import deltacat as dc
 import daft
 
 
 def print_package_version_info():
-    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"DeltaCAT Version: {dc.__version__}")
     print(f"Ray Version: {ray.__version__}")
     print(f"Daft Version: {daft.__version__}")
 
@@ -12,18 +12,24 @@ def print_package_version_info():
 @ray.remote
 def hello_worker():
     print("Hello, Worker!")
+    df = daft.from_pydict({"hello": ["delta", "cat"]})
+    dc.write(df, "hello_world")
     print_package_version_info()
 
 
 def run():
     print("Hello, Driver!")
     print_package_version_info()
-    hello_worker.remote()
+    ray.get(hello_worker.remote())
+    df = dc.read("hello_world")
+    print("=== Table Written by Ray Worker ===")
+    print(df)
 
 
 if __name__ == "__main__":
     # initialize deltacat
-    deltacat.init()
+    # Catalog files will be stored in .deltacat/ in the current working directory.
+    dc.init_local()
 
     # run the example
     run()
@@ -90,6 +90,9 @@ def run(
 
 if __name__ == "__main__":
     """
+    This example script demonstrates how to use the `deltacat.copy` API to copy multimodal source files into
+    arbitrary destinations with optional file format conversion and UDF transformations using DeltaCAT URLs.
+
     Example 1: Run this script locally using Ray:
     $ python indexer.py \
     $ --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
@@ -105,7 +105,12 @@ def run(
 
 if __name__ == "__main__":
     """
-    # Run this example through a command of the form:
+    This example shows how to submit jobs to a remote Ray cluster that indexes source files into arbitrary destinations with
+    optional file format conversion using DeltaCAT URLs. It provides the option to run multiple sequential or concurrent jobs
+    for benchmarking.
+
+    # For example, the following command launches a remote Ray Cluster on AWS, downloads an external OpenAlex dataset text file,
+    # converts it to Parquet, and writes it back to AWS S3. It submits 100 jobs in parallel, each with a timeout of 90 seconds:
     $ python ./deltacat/examples/job_runner.py -- \
     $ --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
     $ --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
@@ -23,6 +23,7 @@ from deltacat.exceptions import (
     SchemaValidationError,
 )
 from deltacat.storage.model.types import (
+    LocalTable,
     SchemaConsistencyType,
     SortOrder,
     NullOrder,
@@ -30,6 +31,7 @@ from deltacat.storage.model.types import (
 from deltacat.types.tables import (
     get_table_length,
     to_pyarrow,
+    get_table_column_names,
     from_pyarrow,
     get_dataset_type,
     SchemaEvolutionMode,
@@ -1174,8 +1176,7 @@ class Schema(dict):
 
     def coerce(
         self,
-        dataset: Union[pa.Table, pd.DataFrame, np.ndarray, Any],
-        manifest_entry_schema: Optional[Schema] = None,
+        dataset: LocalTable,
     ) -> Union[pa.Table, pd.DataFrame, np.ndarray, Any]:
         """Coerce a dataset to match this schema using field type promotion.
 
@@ -1196,7 +1197,6 @@ class Schema(dict):
 
         Args:
             dataset: Dataset to coerce to this schema
-            manifest_entry_schema: Original manifest entry schema used to write the dataset.
 
         Returns:
             Dataset of the same type, coerced to match this schema.
@@ -1208,10 +1208,23 @@ class Schema(dict):
             # No fields defined in schema, return original dataset
             return dataset
 
+        # Create pyarrow schema of fields common to the table schema and input dataset
+        common_fields = []
+        dataset_column_names = [
+            name.lower() for name in get_table_column_names(dataset)
+        ]
+        for field in self.fields:
+            if field.arrow.name.lower() in dataset_column_names:
+                common_fields.append(field.arrow)
+        # If no common fields, return original dataset
+        if not common_fields:
+            return dataset
+        common_schema = pa.schema(common_fields)
+
         # Convert dataset to PyArrow table for processing
         pa_table = to_pyarrow(
             dataset,
-            schema=manifest_entry_schema.arrow if manifest_entry_schema else None,
+            schema=common_schema,
         )
 
         # Process columns using field coercion
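A standalone sketch that mirrors the common-field selection now performed inside Schema.coerce(), using plain PyArrow; the schema and column names are illustrative only:

import pyarrow as pa

# Table schema declares two fields; the incoming dataset only provides one of
# them (with different casing) plus a column the schema does not know about.
table_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
dataset = pa.table({"ID": [1, 2], "extra": ["x", "y"]})

# Case-insensitive intersection of schema fields and dataset columns, as in
# the patched coerce(): only the matching fields are used as the conversion schema.
dataset_column_names = [name.lower() for name in dataset.column_names]
common_fields = [f for f in table_schema if f.name.lower() in dataset_column_names]
common_schema = pa.schema(common_fields)  # contains just "id"

If no common fields are found, the patched coerce() returns the input dataset unchanged.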
@@ -35,7 +35,15 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
         result = provider("base_path")
 
         self.assertTrue(isinstance(provider, FilenameProvider))
-        self.assertRegex(result, r"^base_path/[\w-]{36}$")
+        # assert that the result is a valid UUID
+        self.assertRegex(
+            result, r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
+        )
+        # after deleting the provider, expect to capture one write path with the base path as the prefix
+        del provider
+        write_paths = capture_object.write_paths()
+        self.assertEqual(len(write_paths), 1)
+        self.assertEqual(write_paths[0], f"base_path/{result}")
 
 
 class TestDownloadUpload(unittest.TestCase):