deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/utils/url.py CHANGED
@@ -1,3 +1,6 @@
1
+ # Allow classes to use self-referencing Type hints in Python 3.7.
2
+ from __future__ import annotations
3
+
1
4
  import functools
2
5
  import json
3
6
  from typing import Callable, List, Tuple, Any, Union, Optional
@@ -231,7 +234,18 @@ RAY_DATASTORE_TYPE_TO_WRITER = {
231
234
  ),
232
235
  }
233
236
 
237
+
238
def _daft_binary_reader(url_path: str, **kwargs) -> daft.DataFrame:
    """Download the file at ``url_path`` into a one-row Daft DataFrame.

    The resulting frame has a "url" column containing the URL and a
    "data" column containing the downloaded file bytes.

    Args:
        url_path: URL of the file to download.
        **kwargs: Extra keyword arguments. The DAFT reader registry binds
            ``**url.query_params`` via ``functools.partial``, so URLs that
            carry a query string would otherwise raise ``TypeError`` when
            the reader is invoked; they are accepted and ignored here.

    Returns:
        A single-row Daft DataFrame with "url" and "data" columns.
    """
    df = daft.from_pydict({"url": [url_path]})
    return df.with_column("data", df["url"].url.download())
241
+
242
+
234
243
  DAFT_DATASTORE_TYPE_TO_READER = {
244
+ DatastoreType.BINARY: lambda url: functools.partial(
245
+ _daft_binary_reader,
246
+ url.url_path,
247
+ **url.query_params,
248
+ ),
235
249
  DatastoreType.CSV: lambda url: functools.partial(
236
250
  daft.io.read_csv,
237
251
  url.url_path,
@@ -629,17 +643,25 @@ class DeltaCatUrl:
629
643
  avro+<scheme>://<path>?param1=val1&param2=val2&...
630
644
  binary+<scheme>://<path>?param1=val1&param2=val2&...
631
645
  csv+<scheme>://<path>?param1=val1&param2=val2&...
646
+ deltalake+<scheme>://<path>?param1=val1&param2=val2&...
632
647
  deltasharing+<scheme>://<path>?param1=val1&param2=val2&...
648
+ feather+<scheme>://<path>?param1=val1&param2=val2&...
649
+ hdf+<scheme>://<path>?param1=val1&param2=val2&...
650
+ html+<scheme>://<path>?param1=val1&param2=val2&...
633
651
  hudi+<scheme>://<path>?param1=val1&param2=val2&...
634
652
  images+<scheme>://<path>?param1=val1&param2=val2&...
635
653
  json+<scheme>://<path>?param1=val1&param2=val2&...
636
654
  lance+<scheme>://<path>?param1=val1&param2=val2&...
637
655
  numpy+<scheme>://<path>?param1=val1&param2=val2&...
656
+ orc+<scheme>://<path>?param1=val1&param2=val2&...
638
657
  parquet+<scheme>://<path>?param1=val1&param2=val2&...
639
658
  text+<scheme>://<path>?param1=val1&param2=val2&...
640
659
  tfrecords+<scheme>://<path>?param1=val1&param2=val2&...
660
 + warc+<scheme>://<path>?param1=val1&param2=val2&...
641
662
  videos+<scheme>://<path>?param1=val1&param2=val2&...
642
663
  webdataset+<scheme>://<path>?param1=val1&param2=val2&...
664
+ xml+<scheme>://<path>?param1=val1&param2=val2&...
643
665
 
644
666
  Some DeltaCAT URLs reference special types of external objects
645
667
  locatable via custom URLs that don't conform to the usual
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deltacat
3
- Version: 2.0.0.post1
3
+ Version: 2.0.0.post2
4
4
  Summary: DeltaCAT is a portable Pythonic Data Lakehouse powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -57,6 +57,8 @@ DeltaCAT is a portable Pythonic Data Lakehouse powered by [Ray](https://github.c
57
57
  fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
58
58
  data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
59
59
 
60
+ It provides data lake level transactions & time travel, fast schema evolution for feature enrichment, zero-copy multimodal file processing, schemaless dataset management, and transparent dataset optimization. It runs locally for rapid development or in the cloud for production workloads.
61
+
60
62
  It uses the Ray distributed compute framework together with [Apache Arrow](https://github.com/apache/arrow) and
61
63
  [Daft](https://github.com/Eventual-Inc/Daft) to efficiently scale common table management tasks, like petabyte-scale
62
64
  merge-on-read and copy-on-write operations.
@@ -79,10 +81,14 @@ Data consumers that prefer to stay within the ecosystem of Pythonic data managem
79
81
  ## Getting Started
80
82
  DeltaCAT applications run anywhere that Ray runs, including your local laptop, cloud computing cluster, or on-premise cluster.
81
83
 
82
- DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of one or more data files. A **Catalog** provides a root location (e.g., a local file path or S3 Bucket) to store table information, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
84
+ DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of data files. A **Catalog** can be thought of as a named data lake containing a set of **Tables**. It provides a root location (e.g., a local file path or S3 Bucket) to store table information, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
83
85
 
84
86
  ### Quick Start
85
87
 
88
+ Install DeltaCAT with: `pip install deltacat`
89
+
90
+ Then run this script to create and read your first table:
91
+
86
92
  ```python
87
93
  import deltacat as dc
88
94
  import pandas as pd
@@ -109,7 +115,7 @@ daft_df = dc.read("users") # Returns Daft DataFrame (default)
109
115
  daft_df.show() # Materialize and print the DataFrame
110
116
 
111
117
  # Append more data and add a new column.
112
- # Compaction and schema evolution are handled automatically.
118
+ # Compaction and zero-copy schema evolution are handled automatically.
113
119
  data = pd.DataFrame({
114
120
  "id": [4, 5, 6],
115
121
  "name": ["Tom", "Simpkin", "Delta"],
@@ -129,7 +135,7 @@ DeltaCAT can do much more than just append data to tables and read it back again
129
135
 
130
136
  <details>
131
137
 
132
- <summary><span style="font-size: 1.25em; font-weight: bold;">Replacing and Dropping Tables</span></summary>
138
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Idempotent Writes</span></summary>
133
139
 
134
140
  If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **appends** table data by default. One way to prevent this perpetual table growth and make the example idempotent is to use the **REPLACE** write mode if the table already exists:
135
141
 
@@ -239,7 +245,7 @@ assert dc.dataset_length(daft_df) == 6
239
245
 
240
246
  <details>
241
247
 
242
- <summary><span style="font-size: 1.25em; font-weight: bold;">Supported Dataset and File Formats</span></summary>
248
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Multi-Format Data Processing</span></summary>
243
249
 
244
250
  DeltaCAT natively supports a variety of open dataset and file formats already integrated with Ray and Arrow. You can use `dc.read` to read tables back as a Daft DataFrame, Ray Dataset, Pandas DataFrame, PyArrow Table, Polars DataFrame, NumPy Array, or list of PyArrow ParquetFile objects:
245
251
 
@@ -329,7 +335,7 @@ print("\n=== NumPy Table ===")
329
335
  dc.read("my_numpy_table").show()
330
336
  ```
331
337
 
332
- Or write to different table file formats:
338
+ DeltaCAT tables also support persisting data in heterogeneous table file formats like Avro, ORC, or Feather:
333
339
 
334
340
  ```python
335
341
  data = pd.DataFrame({"id": [1], "name": ["Cheshire"], "age": [3]})
@@ -372,9 +378,9 @@ print(pandas_df)
372
378
 
373
379
  <details>
374
380
 
375
- <summary><span style="font-size: 1.25em; font-weight: bold;">Merging and Deleting Data</span></summary>
381
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Live Feature Enrichment</span></summary>
376
382
 
377
- DeltaCAT can automatically merge and delete data by defining a table schema with one or more merge keys:
383
+ DeltaCAT can update your datasets on-the-fly to keep up with a continuous stream of new insights, and support common ML use-cases like feature enrichment. Just define a table schema with one or more merge keys to start updating and deleting existing records:
378
384
 
379
385
  ```python
380
386
  import deltacat as dc
@@ -385,53 +391,50 @@ import tempfile
385
391
  # Initialize DeltaCAT with a fresh temporary catalog
386
392
  dc.init_local(tempfile.mkdtemp())
387
393
 
388
- # Define a schema with user_id as a merge key.
389
- schema = dc.Schema.of([
394
+ # Start with minimal schema - just user_id as merge key and name
395
+ initial_schema = dc.Schema.of([
390
396
  dc.Field.of(pa.field("user_id", pa.int64()), is_merge_key=True),
391
397
  dc.Field.of(pa.field("name", pa.string())),
392
- dc.Field.of(pa.field("age", pa.int32())),
393
- dc.Field.of(pa.field("status", pa.string())),
394
398
  ])
395
399
 
396
- # Initial user data
400
+ # Initial user data - just basic info
397
401
  initial_users = pd.DataFrame({
398
402
  "user_id": [1, 2, 3],
399
- "name": ["Cheshire", "Dinah", "Felix"],
400
- "age": [3, 7, 2],
401
- "status": ["active", "active", "inactive"]
403
+ "name": ["Jim", "Dinah", "Bob"],
402
404
  })
403
405
 
404
- # Write initial data with the merge key schema
405
- dc.write(initial_users, "users", schema=schema)
406
+ # Write initial data with minimal schema
407
+ dc.write(initial_users, "users", schema=initial_schema)
406
408
 
407
- # Read the data back as a Pandas DataFrame.
409
+ # Read the data back as a Pandas DataFrame
408
410
  df = dc.read("users", read_as=dc.DatasetType.PANDAS)
409
- print("=== Initial Users ===")
411
+ print("=== Initial Users (Basic Info) ===")
410
412
  print(df.sort_values("user_id"))
411
413
 
412
- # Update data for existing users + add new users
413
- updated_users = pd.DataFrame({
414
- "user_id": [2, 3, 4, 5, 6],
415
- "name": ["Dinah", "Felix", "Tom", "Simpkin", "Delta"],
416
- "age": [7, 2, 5, 12, 4],
417
- "status": ["premium", "active", "active", "active", "active"]
414
+ # Later, enrich with new insights: add age/job features + new users
415
+ enriched_data = pd.DataFrame({
416
+ "user_id": [1, 3, 4, 5, 6],
417
+ "name": ["Cheshire", "Felix", "Tom", "Simpkin", "Delta"],
418
+ "age": [3, 2, 5, 12, 4],
419
+ "job": ["Tour Guide", "Drifter", "Housekeeper", "Mouser", "Engineer"]
418
420
  })
419
421
 
420
- # Write automatically detects that the schema has a merge key and:
421
- # 1. Updates existing records with matching user IDs.
422
- # 2. Inserts new records with new user IDs.
423
- dc.write(updated_users, "users", schema=schema)
422
+ # DeltaCAT automatically evolves the schema and merges by user_id:
423
+ # 1. Enriches existing users (Jim -> Cheshire age=3, job="Tour Guide"; Bob -> Felix)
424
+ # 2. Adds new age/job columns with automatic schema evolution
425
+ # 3. Inserts new users (Tom, Simpkin, Delta) with full feature set
426
+ dc.write(enriched_data, "users")
424
427
 
425
- # Read back to see merged results
428
+ # Read back to see live feature enrichment results
426
429
  df = dc.read("users", read_as=dc.DatasetType.PANDAS)
427
- print("\n=== After Merge ===")
430
+ print("\n=== Enriched Users (Age & Job) ===")
428
431
  print(df.sort_values("user_id"))
429
432
 
430
- # - Cheshire (user_id=1) remains unchanged
431
- # - Dinah (user_id=2) status updated to "premium"
432
- # - Felix (user_id=3) updated to "active"
433
- # - New users (4,5,6), (Tom, Simpkin, Delta) added
434
- # - No duplicate user_id values exist
433
+ # - Cheshire (user_id=1) name updated from Jim, gets age=3, job="Tour Guide"
434
+ # - Dinah (user_id=2) keeps original name, gets null age/job (missing features)
435
+ # - Felix (user_id=3) name updated from Bob, gets age=2, job="Drifter"
436
+ # - New users (4,5,6) added with complete feature set
437
+ # - Schema automatically evolved to include age/job columns
435
438
 
436
439
  # Specify the users to delete.
437
440
  # We only need to specify matching merge key values.
@@ -440,7 +443,7 @@ users_to_delete = pd.DataFrame({
440
443
  })
441
444
 
442
445
  # Delete the records that match our merge keys.
443
- dc.write(users_to_delete, "users", schema=schema, mode=dc.TableWriteMode.DELETE)
446
+ dc.write(users_to_delete, "users", mode=dc.TableWriteMode.DELETE)
444
447
 
445
448
  # Read the table back to confirm target users have been deleted.
446
449
  df = dc.read("users", read_as=dc.DatasetType.PANDAS)
@@ -456,6 +459,117 @@ print(df.sort_values("user_id"))
456
459
 
457
460
  <details>
458
461
 
462
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Zero-Copy Multimodal URL Processing</span></summary>
463
+
464
+ DeltaCAT can register and process existing multimodal datasets from local or remote URLs. This enables zero-copy distributed processing of images, audio, text, and other file formats:
465
+
466
+ ```python
467
+ import deltacat as dc
468
+ import pandas as pd
469
+ import pyarrow as pa
470
+ import tempfile
471
+ import ray
472
+
473
+ # Initialize DeltaCAT with a fresh temporary catalog
474
+ dc.init_local(tempfile.mkdtemp())
475
+
476
+ # Create dataset with DeltaCAT URLs pointing to existing files
477
+ urls_df = pd.DataFrame({
478
+ "file_id": [1, 2, 3, 4, 5, 6],
479
+ "url": [
480
+ # URLs with common file extensions will have their content type inferred.
481
+ "https://picsum.photos/id/237/400/300.jpg",
482
+ "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
483
+ "https://raw.githubusercontent.com/SergLam/Audio-Sample-files/master/sample.mp3",
484
+ "https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
485
+ "https://raw.githubusercontent.com/microsoft/vscode/main/package.json",
486
+ # URLs without common file extensions will be read as binary by default.
487
+ "https://picsum.photos/200"
488
+ ]
489
+ })
490
+
491
+ # Create empty table with merge key to efficiently add insights about each file
492
+ dc.create_table(
493
+ "multimodal_files",
494
+ schema=dc.Schema.of([
495
+ dc.Field.of(pa.field("file_id", pa.int64()), is_merge_key=True),
496
+ dc.Field.of(pa.field("url", pa.string()))
497
+ ])
498
+ )
499
+
500
+ # Write URLs to DeltaCAT table
501
+ dc.write(urls_df, "multimodal_files")
502
+
503
+ # UDF to process each file in parallel using Ray Dataset map method
504
+ def analyze_file(row):
505
+ file_id = row["file_id"]
506
+ url = row["url"]
507
+
508
+ # DeltaCAT automatically infers the right Ray Data reader for the URL
509
+ dataset = dc.get(url)
510
+ records = dataset.take_all()
511
+ url_type = dc.DatastoreType.from_url(url)
512
+
513
+ # Extract standard Ray Dataset fields for each file type
514
+ if url_type == dc.DatastoreType.IMAGES:
515
+ image = records[0]["image"]
516
+ analysis = f"Image {image.shape[1]}x{image.shape[0]} pixels"
517
+ elif url_type == dc.DatastoreType.CSV:
518
+ analysis = f"CSV with {len(records)} rows, {len(records[0].keys())} columns"
519
+ elif url_type == dc.DatastoreType.AUDIO:
520
+ sample_rate = records[0]["sample_rate"]
521
+ duration = len(records[0]["amplitude"][0]) / sample_rate
522
+ analysis = f"Audio {duration:.1f}s, {sample_rate}Hz"
523
+ elif url_type == dc.DatastoreType.JSON:
524
+ analysis = f"JSON with {len(records[0].keys())} fields"
525
+ elif url_type == dc.DatastoreType.TEXT:
526
+ analysis = f"Text with {len(records)} records"
527
+ else:
528
+ analysis = f"Binary with {len(records[0]['bytes'])} bytes"
529
+
530
+ return {"file_id": file_id, "analysis": analysis}
531
+
532
+ # Read the multimodal_files table as a Ray Dataset
533
+ ray_dataset = dc.read("multimodal_files", read_as=dc.DatasetType.RAY_DATASET)
534
+ # Download and analyze each URL in parallel using map
535
+ results_dataset = ray_dataset.map(analyze_file)
536
+
537
+ # Write results back to the multimodal_files table
538
+ dc.write(results_dataset, "multimodal_files", mode=dc.TableWriteMode.MERGE)
539
+
540
+ # Read final results and compare to initial dataset
541
+ print("\n=== Initial Dataset ===")
542
+ print(dc.to_pandas(ray_dataset))
543
+
544
+ print("\n=== Final Results with Analysis ===")
545
+ print(dc.read("multimodal_files", read_as=dc.DatasetType.PANDAS))
546
+ ```
547
+
548
+ The default dataset type used by `dc.get` is a Ray Dataset but, similar to `dc.read`, `dc.get` can also read URLs into other dataset types like Daft:
549
+
550
+ ```python
551
+ import deltacat as dc
552
+
553
+ # Create dataset with DeltaCAT URLs pointing to existing files
554
+ urls = [
555
+ # URLs with common file extensions will have their content type inferred.
556
+ "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
557
+ "https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
558
+ # URLs without common file extensions will be read as binary by default.
559
+ "https://picsum.photos/200"
560
+ ]
561
+
562
+ # Download each URL into a Daft DataFrame serially
563
+ for url in urls:
564
+ dataset = dc.get(url, read_as=dc.DatasetType.DAFT)
565
+ print(f"\n=== {url} ===")
566
 + dataset.show()
567
+ ```
568
+
569
+ </details>
570
+
571
+ <details>
572
+
459
573
  <summary><span style="font-size: 1.25em; font-weight: bold;">Organizing Tables with Namespaces</span></summary>
460
574
 
461
575
  In DeltaCAT, table **Namespaces** are optional but useful for organizing related tables within a catalog:
@@ -534,9 +648,9 @@ print(finance_df)
534
648
 
535
649
  <details>
536
650
 
537
- <summary><span style="font-size: 1.25em; font-weight: bold;">Multi-Table Transactions</span></summary>
651
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Transactions</span></summary>
538
652
 
539
- DeltaCAT transactions can span multiple tables and namespaces. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
653
+ DeltaCAT transactions can span multiple tables and namespaces. Since transaction history is maintained at the catalog level, every transaction operates against a consistent snapshot of every object in your data lake. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
540
654
 
541
655
  Consider the previous example that organized tables with namespaces. One table tracked customer orders, and another table tracked the lifetime payments of each customer. If one table was updated but not the other, then it would result in an accounting discrepancy. This edge case can be eliminated by using multi-table transactions:
542
656
 
@@ -630,7 +744,7 @@ print(dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS))
630
744
 
631
745
  <details>
632
746
 
633
- <summary><span style="font-size: 1.25em; font-weight: bold;">Working with Multiple Catalogs</span></summary>
747
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Managing Multiple Data Lakes</span></summary>
634
748
 
635
749
  DeltaCAT lets you work with multiple catalogs in a single application. All catalogs registered with DeltaCAT are tracked by a Ray Actor to make them available to all workers in your Ray application.
636
750
 
@@ -652,8 +766,8 @@ dc.init(catalogs={
652
766
  filesystem=pa.fs.LocalFileSystem()
653
767
  )),
654
768
  "prod": dc.Catalog(config=dc.CatalogProperties(
655
- root=tempfile.mkdtemp(), # Use temporary directory for prod
656
- filesystem=pa.fs.LocalFileSystem()
769
+ root="s3://example/deltacat/", # Use S3 for prod
770
+ filesystem=pa.fs.S3FileSystem()
657
771
  ))
658
772
  })
659
773
 
@@ -705,9 +819,9 @@ print(dc.read("financial_data", catalog="prod", read_as=dc.DatasetType.PANDAS))
705
819
 
706
820
  <details>
707
821
 
708
- <summary><span style="font-size: 1.25em; font-weight: bold;">Transaction History & Time Travel</span></summary>
822
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Time Travel</span></summary>
709
823
 
710
- DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with multi-table transactions, this enables consistent point-in-time views across your entire data catalog.
824
+ DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with catalog-level transactions, this enables consistent point-in-time views across your entire data lake.
711
825
 
712
826
  ```python
713
827
  import deltacat as dc
@@ -847,7 +961,7 @@ print("\nTime travel validation successful!")
847
961
 
848
962
  <summary><span style="font-size: 1.25em; font-weight: bold;">Multimodal Batch Inference</span></summary>
849
963
 
850
- DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed precitions predictions for each image:
964
+ DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed predictions for each image:
851
965
 
852
966
  > **Requirements**: This example requires PyTorch ≥ 2.8.0 and torchvision ≥ 0.23.0. Install via: `pip install torch>=2.8.0 torchvision>=0.23.0`
853
967
 
@@ -938,7 +1052,7 @@ final_df.show()
938
1052
 
939
1053
  <summary><span style="font-size: 1.25em; font-weight: bold;">LLM Batch Inference</span></summary>
940
1054
 
941
- DeltaCAT multi-table transactions, time travel queries, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
1055
+ DeltaCAT multi-table transactions, data lake time travel, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
942
1056
 
943
1057
  ```python
944
1058
  import deltacat as dc
@@ -1,6 +1,6 @@
1
- deltacat/__init__.py,sha256=mTb9CK1GTZuGmALUxQtr717n6eUNusG9tbcPyTpXNI8,4452
1
+ deltacat/__init__.py,sha256=8oHmukh7qFhVfUT89l4zBtonbu_wsoj2hJsPaka0PoA,4452
2
2
  deltacat/annotations.py,sha256=9lBi34DpIV_RPjCCK2Aiz_6nMyd-e-_CfQ1XtdRQQlM,1196
3
- deltacat/api.py,sha256=MwCB60tWzEru-Jv1tTWcxYWuID3e5GbCy1jwn4XiDXs,20497
3
+ deltacat/api.py,sha256=W7u3jeKZsvTWNi9zkhOC2O6BMwZ3TvMRbnbpcn6lFBo,21940
4
4
  deltacat/constants.py,sha256=HPE3SbK1-LRjtTu3OKD9s4N__LWMwj3xFP2N3Qy8fzM,4701
5
5
  deltacat/env.py,sha256=BJdTt8od3IVR4RMLjBxy4oRUHM7Lb16AzMOz8-hpwOI,2303
6
6
  deltacat/exceptions.py,sha256=dqZizcMKC3VwO7EgHXdAC4YUivBKVJgNwQLibMP93MA,16051
@@ -23,7 +23,7 @@ deltacat/catalog/__init__.py,sha256=lsu9N2G6P6HkyvrIpGY34SVkJM8-lwVaNfZanNTRjAc,
23
23
  deltacat/catalog/delegate.py,sha256=RDOQHaYvpvwc3RTZNaJhv00yXV1WHgE8YcD4i19H6g0,26870
24
24
  deltacat/catalog/interface.py,sha256=rmJSVi8dNORVa0ydzRFRwMcbpXwhDjYEpGAIGi-4O08,18486
25
25
  deltacat/catalog/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- deltacat/catalog/main/impl.py,sha256=lztAIZ4MRGC2RFtGpbDwJdZgIWPuooX8JW_zG9tyzVk,103157
26
+ deltacat/catalog/main/impl.py,sha256=V-JeQkAIQOVwbupJ_-2sSMUQ5P12crcWF3qArFVHMg8,100224
27
27
  deltacat/catalog/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  deltacat/catalog/model/catalog.py,sha256=gvfczu9yhvDIjPjx5ZE69IUu1I_nhZOHURsOAakzhcQ,12765
29
29
  deltacat/catalog/model/properties.py,sha256=Bt7JgmG9UQD9ABqrCXniGrbRWpYWbini9ZCY8dBhifU,5416
@@ -132,7 +132,7 @@ deltacat/docs/autogen/schema/inference/generate_type_mappings.py,sha256=ZH30xcsA
132
132
  deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py,sha256=_N37jw5nmNlf40V2mOjDcXdJNhm1qoEa_fQdz_XRk1c,28929
133
133
  deltacat/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
134
134
  deltacat/examples/basic_logging.py,sha256=Umrum-gvY3gJjDNJ4hOMslMMq9bzeTM-s_DO4dGqJiw,2833
135
- deltacat/examples/hello_world.py,sha256=FvxkEDB1qVPJv55Fe1I7Coy0VLYJIisU7ZFYYkw9U2g,525
135
+ deltacat/examples/hello_world.py,sha256=dm4GNvNL_HElPtE50sZzaZFrV48BcRL89nZp9SnLSIw,799
136
136
  deltacat/examples/compactor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
137
  deltacat/examples/compactor/bootstrap.py,sha256=6BXDWsvH3QuSDmd31Wc0I4_qLy9lZTW4_029MGRslzA,35126
138
138
  deltacat/examples/compactor/compactor.py,sha256=_FbM9paIly4JK3FYP3t5nDPNL98I6K9UbhidachNaAE,12431
@@ -155,8 +155,8 @@ deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py,sha256=N
155
155
  deltacat/examples/experimental/iceberg/converter/beam/utils/common.py,sha256=wrUk-8sojz4sudZPMzCHyNVLsw1opBg23C9_q6z8AhA,6388
156
156
  deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py,sha256=CAMzNgeDDt4UKVTnUCEu8oRTB57rjBUwK6MxLLO3GBA,10046
157
157
  deltacat/examples/indexer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
- deltacat/examples/indexer/indexer.py,sha256=A5gfWomVtfu9uRrAB6UbitQ158ZFkviUscSveri1ggs,6334
159
- deltacat/examples/indexer/job_runner.py,sha256=M3ZKTcPL5gy1u_E3aZPal2u5INWZlBGDrsRhb5-F7n4,5921
158
+ deltacat/examples/indexer/indexer.py,sha256=7SqMfzte-PzSKcOsVQ9k-F9dODio70yzU0M0S3CldH8,6553
159
+ deltacat/examples/indexer/job_runner.py,sha256=Xwm6raw-Bx_Gq-8uMcw8ohdja2L6HgDlBLwCWDsRnbg,6398
160
160
  deltacat/examples/indexer/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
161
  deltacat/examples/indexer/gcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
162
  deltacat/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -251,7 +251,7 @@ deltacat/storage/model/manifest.py,sha256=NnriTMm6waxixsjykfTABh4EWf985tA-A19AA2
251
251
  deltacat/storage/model/metafile.py,sha256=g-dgFX5fmW51EGhjiN5jpHR1LTOhf0jhUIMBapRD1Rw,58619
252
252
  deltacat/storage/model/namespace.py,sha256=9V1Qj232uc_UrVzZPIRzXyeYhJOYZ25wPLCx15-dx1Y,2630
253
253
  deltacat/storage/model/partition.py,sha256=UcHcBQV0Kf_RnVIFzoYQo8MUdOrjZdWsaiGUv8FKXx4,24298
254
- deltacat/storage/model/schema.py,sha256=jF8LvLvXUDI7pE2kv-LgGvGAuWRWC75JekqcS7147lc,122416
254
+ deltacat/storage/model/schema.py,sha256=bGtrm3xB0cr20HRICcRE6vQ5JKaTTrPGKIQ2cPaaWC8,122763
255
255
  deltacat/storage/model/shard.py,sha256=boPOW45bwLwBazfXZpa3-C5SUSlgelpHf8Yl6357Bq0,1575
256
256
  deltacat/storage/model/sort_key.py,sha256=68TJavprndKLESnWfCjXaeMwFE6tcq3ZVOHloE9rV6Q,7287
257
257
  deltacat/storage/model/stream.py,sha256=VJgqVy4NS6IHLBLRf5OyehldZbIrarqGZYN07XF4Yp4,12609
@@ -285,10 +285,10 @@ deltacat/tests/_io/reader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
285
285
  deltacat/tests/_io/reader/test_deltacat_read_api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
286
286
  deltacat/tests/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
287
287
  deltacat/tests/aws/test_clients.py,sha256=23GMWfz27WWBDXSqphG9mfputsyS7j3I5P_HRk4YoKE,3790
288
- deltacat/tests/aws/test_s3u.py,sha256=kL3cL37d-myF_NE0oP3SVTkhEt9yrqmJI83Xhr-i74Q,6869
288
+ deltacat/tests/aws/test_s3u.py,sha256=M27w8BDbv638ReYtr5kA2eXcd3xJmRwfOYxLV0tax_s,7268
289
289
  deltacat/tests/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
290
290
  deltacat/tests/catalog/test_catalogs.py,sha256=MFS_fISQq7VGzjaVVM1K9O_QSE-02SN0gHwUtQ7m-HU,11696
291
- deltacat/tests/catalog/test_default_catalog_impl.py,sha256=ISlJcuBVkYxUDsZuJoH1FyTeWtb7espbcFvcbY9OZ-o,468810
291
+ deltacat/tests/catalog/test_default_catalog_impl.py,sha256=_YiLAHCxECd_lQKaFu22qlH1dDWMQmxo6qRkpxuxPrI,476575
292
292
  deltacat/tests/catalog/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
293
293
  deltacat/tests/catalog/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
294
294
  deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py,sha256=XvMoW1yjjoIs4-0A8_dqeY7ArysN6HJkSSHk7JnHeUI,4313
@@ -404,9 +404,9 @@ deltacat/tests/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
404
404
  deltacat/tests/utils/ray_utils/test_concurrency.py,sha256=TjZpX0cjMDEIS79p_--j_BfT0zXKNkTLY1ZzNokBTs0,1211
405
405
  deltacat/tests/utils/ray_utils/test_dataset.py,sha256=glfihM4FBqqIWcW5SdU-SYqhmeMIPfl8Krfzj0oEviI,6418
406
406
  deltacat/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
407
- deltacat/types/media.py,sha256=8V02OmOhkoWODaB2M6fiJy4FBcVzODuynR4QHtEHbTg,12283
407
+ deltacat/types/media.py,sha256=yIWs6Wcb00bnZeQvwdpQoXYbjvw4BL81dWZU50XE23g,22317
408
408
  deltacat/types/partial_download.py,sha256=QIpNTSwaiZ4TVl4A1N4PtblevKT5GwdXtGrouQMQs1E,2510
409
- deltacat/types/tables.py,sha256=xAS_XBIOLr_Lp0C7kPD8Lk-ubDh6h-ZZtCFZLh-9vMs,85384
409
+ deltacat/types/tables.py,sha256=SrnPQB2-VFxkUSSRIDnImVhV39OnFJYHpZ3yH9Y3BI8,85083
410
410
  deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
411
411
  deltacat/utils/arguments.py,sha256=WzEjt8N4rXE5Vkcirz18ppZguBENxYm8F8m97LshV1Y,2162
412
412
  deltacat/utils/cloudpickle.py,sha256=XE7YDmQe56ksfl3NdYZkzOAhbHSuhNcBZGOehQpgZr0,1187
@@ -417,23 +417,23 @@ deltacat/utils/filesystem.py,sha256=jQ_vY0lBJcSuKqSOjSwB7q-s52ckYhJSvnGT_aYZvUc,
417
417
  deltacat/utils/metafile_locator.py,sha256=AJ6o2V5Cc7rJE89wWyKmsFIWOxmGM2APs8DCynmuTjg,2984
418
418
  deltacat/utils/metrics.py,sha256=HYKyZSrtVLu8gXezg_TMNUKJp4h1WWI0VEzn0Xlzf-I,10778
419
419
  deltacat/utils/numpy.py,sha256=tgq4j_9q9bERxsr0-h3t55BrciS2ivr1AZe7R1DldkA,5524
420
- deltacat/utils/pandas.py,sha256=4C9cdGDDUP9SSytjtgSdkhivrmmF43TwxN2qJhxsBKg,31226
420
+ deltacat/utils/pandas.py,sha256=v8wS_pArpyVJ1p3oYFs2uy6qt38httUYXfVHdAGlPso,31418
421
421
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
422
422
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
423
- deltacat/utils/polars.py,sha256=iYmgGRWrCjPEqwrf8bFY-oIjKfvo7jZar_GRagYxhTg,28838
424
- deltacat/utils/pyarrow.py,sha256=YwtYrWCWzEVFipQ-oE7lHpCYjJTdJujl9sPub8xtLYo,74233
423
+ deltacat/utils/polars.py,sha256=-_6CGDhhZ_g8Z2sTc_GBWvHSNXNWOszitxYeckrOQ9g,28906
424
+ deltacat/utils/pyarrow.py,sha256=i3__I5c1UCEjG8N1i2szWc2vHYUM4Hz1T1g1-mfQYfw,74410
425
425
  deltacat/utils/reader_compatibility_mapping.py,sha256=fZcNdw4kamkQF-ZzvBC4Zp_sbjxp0yOVIhLgV6V2Ee8,91409
426
426
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
427
427
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
428
- deltacat/utils/url.py,sha256=Meg4PQGzd_NQa966O0bcdyhalUoZ6-lt_A2g6suuqfI,44832
428
+ deltacat/utils/url.py,sha256=H9L6Pgr8MNtUqFBwPAygogbZ2mZ1Het308J7mu0kpyQ,45690
429
429
  deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
430
430
  deltacat/utils/ray_utils/collections.py,sha256=hj20s4D2RF2jZETU_44r6mFbsczA0JI_I_4kWKTmqes,1951
431
431
  deltacat/utils/ray_utils/concurrency.py,sha256=Ceui6nQYKHTUOTltHNQIdb0OWHFhD73o8DhSXP-DYRQ,5457
432
432
  deltacat/utils/ray_utils/dataset.py,sha256=5RnVqFlKoZ6zabnQfjfXAKWuXDMKvLp4eNcDgpFj3OM,6480
433
433
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
434
434
  deltacat/utils/ray_utils/runtime.py,sha256=cf5koY9q4TzRg--BjPtC6y0jztq45F39KcC4K6Wmg4w,6946
435
- deltacat-2.0.0.post1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
436
- deltacat-2.0.0.post1.dist-info/METADATA,sha256=RPLgM5MfzkMaH55vich_RCULmhvzmDi_2Zq6uK3-5aA,46952
437
- deltacat-2.0.0.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
438
- deltacat-2.0.0.post1.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
439
- deltacat-2.0.0.post1.dist-info/RECORD,,
435
+ deltacat-2.0.0.post2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
436
+ deltacat-2.0.0.post2.dist-info/METADATA,sha256=lIEs05JT0ZWkYJoM2JPh77nkvFhgVQ2yyieQPxA0Ofo,52106
437
+ deltacat-2.0.0.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
438
+ deltacat-2.0.0.post2.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
439
+ deltacat-2.0.0.post2.dist-info/RECORD,,