deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/api.py +44 -7
- deltacat/catalog/main/impl.py +34 -110
- deltacat/examples/hello_world.py +10 -4
- deltacat/examples/indexer/indexer.py +3 -0
- deltacat/examples/indexer/job_runner.py +6 -1
- deltacat/storage/model/schema.py +17 -4
- deltacat/tests/aws/test_s3u.py +9 -1
- deltacat/tests/catalog/test_default_catalog_impl.py +198 -7
- deltacat/types/media.py +282 -0
- deltacat/types/tables.py +5 -11
- deltacat/utils/pandas.py +11 -3
- deltacat/utils/polars.py +3 -1
- deltacat/utils/pyarrow.py +7 -3
- deltacat/utils/url.py +22 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/METADATA +161 -47
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/RECORD +20 -20
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/licenses/LICENSE +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/top_level.txt +0 -0
deltacat/utils/url.py
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
1
4
|
import functools
|
2
5
|
import json
|
3
6
|
from typing import Callable, List, Tuple, Any, Union, Optional
|
@@ -231,7 +234,18 @@ RAY_DATASTORE_TYPE_TO_WRITER = {
|
|
231
234
|
),
|
232
235
|
}
|
233
236
|
|
237
|
+
|
238
|
+
def _daft_binary_reader(url_path: str) -> daft.DataFrame:
|
239
|
+
df = daft.from_pydict({"url": [url_path]})
|
240
|
+
return df.with_column("data", df["url"].url.download())
|
241
|
+
|
242
|
+
|
234
243
|
DAFT_DATASTORE_TYPE_TO_READER = {
|
244
|
+
DatastoreType.BINARY: lambda url: functools.partial(
|
245
|
+
_daft_binary_reader,
|
246
|
+
url.url_path,
|
247
|
+
**url.query_params,
|
248
|
+
),
|
235
249
|
DatastoreType.CSV: lambda url: functools.partial(
|
236
250
|
daft.io.read_csv,
|
237
251
|
url.url_path,
|
@@ -629,17 +643,25 @@ class DeltaCatUrl:
|
|
629
643
|
avro+<scheme>://<path>?param1=val1&param2=val2&...
|
630
644
|
binary+<scheme>://<path>?param1=val1&param2=val2&...
|
631
645
|
csv+<scheme>://<path>?param1=val1&param2=val2&...
|
646
|
+
deltalake+<scheme>://<path>?param1=val1&param2=val2&...
|
632
647
|
deltasharing+<scheme>://<path>?param1=val1&param2=val2&...
|
648
|
+
feather+<scheme>://<path>?param1=val1&param2=val2&...
|
649
|
+
hdf+<scheme>://<path>?param1=val1&param2=val2&...
|
650
|
+
html+<scheme>://<path>?param1=val1&param2=val2&...
|
633
651
|
hudi+<scheme>://<path>?param1=val1&param2=val2&...
|
634
652
|
images+<scheme>://<path>?param1=val1&param2=val2&...
|
635
653
|
json+<scheme>://<path>?param1=val1&param2=val2&...
|
636
654
|
lance+<scheme>://<path>?param1=val1&param2=val2&...
|
637
655
|
numpy+<scheme>://<path>?param1=val1&param2=val2&...
|
656
|
+
orc+<scheme>://<path>?param1=val1&param2=val2&...
|
638
657
|
parquet+<scheme>://<path>?param1=val1&param2=val2&...
|
639
658
|
text+<scheme>://<path>?param1=val1&param2=val2&...
|
640
659
|
tfrecords+<scheme>://<path>?param1=val1&param2=val2&...
|
660
|
+
text+<scheme>://<path>?param1=val1&param2=val2&...
|
661
|
+
warc+<scheme>://<path>?param1=val1&param2=val2&...
|
641
662
|
videos+<scheme>://<path>?param1=val1&param2=val2&...
|
642
663
|
webdataset+<scheme>://<path>?param1=val1&param2=val2&...
|
664
|
+
xml+<scheme>://<path>?param1=val1&param2=val2&...
|
643
665
|
|
644
666
|
Some DeltaCAT URLs reference special types of external objects
|
645
667
|
locatable via custom URLs that don't conform to the usual
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deltacat
|
3
|
-
Version: 2.0.0.post1
|
3
|
+
Version: 2.0.0.post2
|
4
4
|
Summary: DeltaCAT is a portable Pythonic Data Lakehouse powered by Ray.
|
5
5
|
Home-page: https://github.com/ray-project/deltacat
|
6
6
|
Author: Ray Team
|
@@ -57,6 +57,8 @@ DeltaCAT is a portable Pythonic Data Lakehouse powered by [Ray](https://github.c
|
|
57
57
|
fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
|
58
58
|
data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
|
59
59
|
|
60
|
+
It provides data lake level transactions & time travel, fast schema evolution for feature enrichment, zero-copy multimodal file processing, schemaless dataset management, and transparent dataset optimization. It runs locally for rapid development or in the cloud for production workloads.
|
61
|
+
|
60
62
|
It uses the Ray distributed compute framework together with [Apache Arrow](https://github.com/apache/arrow) and
|
61
63
|
[Daft](https://github.com/Eventual-Inc/Daft) to efficiently scale common table management tasks, like petabyte-scale
|
62
64
|
merge-on-read and copy-on-write operations.
|
@@ -79,10 +81,14 @@ Data consumers that prefer to stay within the ecosystem of Pythonic data managem
|
|
79
81
|
## Getting Started
|
80
82
|
DeltaCAT applications run anywhere that Ray runs, including your local laptop, cloud computing cluster, or on-premise cluster.
|
81
83
|
|
82
|
-
DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of
|
84
|
+
DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of data files. A **Catalog** can be thought of as a named data lake containing a set of **Tables**. It provides a root location (e.g., a local file path or S3 Bucket) to store table information, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
|
83
85
|
|
84
86
|
### Quick Start
|
85
87
|
|
88
|
+
Install DeltaCAT with: `pip install deltacat`
|
89
|
+
|
90
|
+
Then run this script to create and read your first table:
|
91
|
+
|
86
92
|
```python
|
87
93
|
import deltacat as dc
|
88
94
|
import pandas as pd
|
@@ -109,7 +115,7 @@ daft_df = dc.read("users") # Returns Daft DataFrame (default)
|
|
109
115
|
daft_df.show() # Materialize and print the DataFrame
|
110
116
|
|
111
117
|
# Append more data and add a new column.
|
112
|
-
# Compaction and schema evolution are handled automatically.
|
118
|
+
# Compaction and zero-copy schema evolution are handled automatically.
|
113
119
|
data = pd.DataFrame({
|
114
120
|
"id": [4, 5, 6],
|
115
121
|
"name": ["Tom", "Simpkin", "Delta"],
|
@@ -129,7 +135,7 @@ DeltaCAT can do much more than just append data to tables and read it back again
|
|
129
135
|
|
130
136
|
<details>
|
131
137
|
|
132
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
138
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Idempotent Writes</span></summary>
|
133
139
|
|
134
140
|
If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **appends** table data by default. One way to prevent this perpetual table growth and make the example idempotent is to use the **REPLACE** write mode if the table already exists:
|
135
141
|
|
@@ -239,7 +245,7 @@ assert dc.dataset_length(daft_df) == 6
|
|
239
245
|
|
240
246
|
<details>
|
241
247
|
|
242
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
248
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Multi-Format Data Processing</span></summary>
|
243
249
|
|
244
250
|
DeltaCAT natively supports a variety of open dataset and file formats already integrated with Ray and Arrow. You can use `dc.read` to read tables back as a Daft DataFrame, Ray Dataset, Pandas DataFrame, PyArrow Table, Polars DataFrame, NumPy Array, or list of PyArrow ParquetFile objects:
|
245
251
|
|
@@ -329,7 +335,7 @@ print("\n=== NumPy Table ===")
|
|
329
335
|
dc.read("my_numpy_table").show()
|
330
336
|
```
|
331
337
|
|
332
|
-
|
338
|
+
DeltaCAT tables also support persisting data in heterogeneous table file formats like Avro, ORC, or Feather:
|
333
339
|
|
334
340
|
```python
|
335
341
|
data = pd.DataFrame({"id": [1], "name": ["Cheshire"], "age": [3]})
|
@@ -372,9 +378,9 @@ print(pandas_df)
|
|
372
378
|
|
373
379
|
<details>
|
374
380
|
|
375
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
381
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Live Feature Enrichment</span></summary>
|
376
382
|
|
377
|
-
DeltaCAT can
|
383
|
+
DeltaCAT can update your datasets on-the-fly to keep up with a continuous stream of new insights, and support common ML use-cases like feature enrichment. Just define a table schema with one or more merge keys to start updating and deleting existing records:
|
378
384
|
|
379
385
|
```python
|
380
386
|
import deltacat as dc
|
@@ -385,53 +391,50 @@ import tempfile
|
|
385
391
|
# Initialize DeltaCAT with a fresh temporary catalog
|
386
392
|
dc.init_local(tempfile.mkdtemp())
|
387
393
|
|
388
|
-
#
|
389
|
-
|
394
|
+
# Start with minimal schema - just user_id as merge key and name
|
395
|
+
initial_schema = dc.Schema.of([
|
390
396
|
dc.Field.of(pa.field("user_id", pa.int64()), is_merge_key=True),
|
391
397
|
dc.Field.of(pa.field("name", pa.string())),
|
392
|
-
dc.Field.of(pa.field("age", pa.int32())),
|
393
|
-
dc.Field.of(pa.field("status", pa.string())),
|
394
398
|
])
|
395
399
|
|
396
|
-
# Initial user data
|
400
|
+
# Initial user data - just basic info
|
397
401
|
initial_users = pd.DataFrame({
|
398
402
|
"user_id": [1, 2, 3],
|
399
|
-
"name": ["
|
400
|
-
"age": [3, 7, 2],
|
401
|
-
"status": ["active", "active", "inactive"]
|
403
|
+
"name": ["Jim", "Dinah", "Bob"],
|
402
404
|
})
|
403
405
|
|
404
|
-
# Write initial data with
|
405
|
-
dc.write(initial_users, "users", schema=
|
406
|
+
# Write initial data with minimal schema
|
407
|
+
dc.write(initial_users, "users", schema=initial_schema)
|
406
408
|
|
407
|
-
# Read the data back as a Pandas DataFrame
|
409
|
+
# Read the data back as a Pandas DataFrame
|
408
410
|
df = dc.read("users", read_as=dc.DatasetType.PANDAS)
|
409
|
-
print("=== Initial Users ===")
|
411
|
+
print("=== Initial Users (Basic Info) ===")
|
410
412
|
print(df.sort_values("user_id"))
|
411
413
|
|
412
|
-
#
|
413
|
-
|
414
|
-
"user_id": [
|
415
|
-
"name": ["
|
416
|
-
"age": [
|
417
|
-
"
|
414
|
+
# Later, enrich with new insights: add age/job features + new users
|
415
|
+
enriched_data = pd.DataFrame({
|
416
|
+
"user_id": [1, 3, 4, 5, 6],
|
417
|
+
"name": ["Cheshire", "Felix", "Tom", "Simpkin", "Delta"],
|
418
|
+
"age": [3, 2, 5, 12, 4],
|
419
|
+
"job": ["Tour Guide", "Drifter", "Housekeeper", "Mouser", "Engineer"]
|
418
420
|
})
|
419
421
|
|
420
|
-
#
|
421
|
-
# 1.
|
422
|
-
# 2.
|
423
|
-
|
422
|
+
# DeltaCAT automatically evolves the schema and merges by user_id:
|
423
|
+
# 1. Enriches existing users (Jim -> Cheshire age=3, job="Tour Guide"; Bob -> Felix)
|
424
|
+
# 2. Adds new age/job columns with automatic schema evolution
|
425
|
+
# 3. Inserts new users (Tom, Simpkin, Delta) with full feature set
|
426
|
+
dc.write(enriched_data, "users")
|
424
427
|
|
425
|
-
# Read back to see
|
428
|
+
# Read back to see live feature enrichment results
|
426
429
|
df = dc.read("users", read_as=dc.DatasetType.PANDAS)
|
427
|
-
print("\n===
|
430
|
+
print("\n=== Enriched Users (Age & Job) ===")
|
428
431
|
print(df.sort_values("user_id"))
|
429
432
|
|
430
|
-
# - Cheshire (user_id=1)
|
431
|
-
# - Dinah (user_id=2)
|
432
|
-
# - Felix (user_id=3) updated
|
433
|
-
# - New users (4,5,6)
|
434
|
-
# -
|
433
|
+
# - Cheshire (user_id=1) name updated from Jim, gets age=3, job="Tour Guide"
|
434
|
+
# - Dinah (user_id=2) keeps original name, gets null age/job (missing features)
|
435
|
+
# - Felix (user_id=3) name updated from Bob, gets age=2, job="Drifter"
|
436
|
+
# - New users (4,5,6) added with complete feature set
|
437
|
+
# - Schema automatically evolved to include age/job columns
|
435
438
|
|
436
439
|
# Specify the users to delete.
|
437
440
|
# We only need to specify matching merge key values.
|
@@ -440,7 +443,7 @@ users_to_delete = pd.DataFrame({
|
|
440
443
|
})
|
441
444
|
|
442
445
|
# Delete the records that match our merge keys.
|
443
|
-
dc.write(users_to_delete, "users",
|
446
|
+
dc.write(users_to_delete, "users", mode=dc.TableWriteMode.DELETE)
|
444
447
|
|
445
448
|
# Read the table back to confirm target users have been deleted.
|
446
449
|
df = dc.read("users", read_as=dc.DatasetType.PANDAS)
|
@@ -456,6 +459,117 @@ print(df.sort_values("user_id"))
|
|
456
459
|
|
457
460
|
<details>
|
458
461
|
|
462
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Zero-Copy Multimodal URL Processing</span></summary>
|
463
|
+
|
464
|
+
DeltaCAT can register and process existing multimodal datasets from local or remote URLs. This enables zero-copy distributed processing of images, audio, text, and other file formats:
|
465
|
+
|
466
|
+
```python
|
467
|
+
import deltacat as dc
|
468
|
+
import pandas as pd
|
469
|
+
import pyarrow as pa
|
470
|
+
import tempfile
|
471
|
+
import ray
|
472
|
+
|
473
|
+
# Initialize DeltaCAT with a fresh temporary catalog
|
474
|
+
dc.init_local(tempfile.mkdtemp())
|
475
|
+
|
476
|
+
# Create dataset with DeltaCAT URLs pointing to existing files
|
477
|
+
urls_df = pd.DataFrame({
|
478
|
+
"file_id": [1, 2, 3, 4, 5, 6],
|
479
|
+
"url": [
|
480
|
+
# URLs with common file extensions will have their content type inferred.
|
481
|
+
"https://picsum.photos/id/237/400/300.jpg",
|
482
|
+
"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
|
483
|
+
"https://raw.githubusercontent.com/SergLam/Audio-Sample-files/master/sample.mp3",
|
484
|
+
"https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
|
485
|
+
"https://raw.githubusercontent.com/microsoft/vscode/main/package.json",
|
486
|
+
# URLs without common file extensions will be read as binary by default.
|
487
|
+
"https://picsum.photos/200"
|
488
|
+
]
|
489
|
+
})
|
490
|
+
|
491
|
+
# Create empty table with merge key to efficiently add insights about each file
|
492
|
+
dc.create_table(
|
493
|
+
"multimodal_files",
|
494
|
+
schema=dc.Schema.of([
|
495
|
+
dc.Field.of(pa.field("file_id", pa.int64()), is_merge_key=True),
|
496
|
+
dc.Field.of(pa.field("url", pa.string()))
|
497
|
+
])
|
498
|
+
)
|
499
|
+
|
500
|
+
# Write URLs to DeltaCAT table
|
501
|
+
dc.write(urls_df, "multimodal_files")
|
502
|
+
|
503
|
+
# UDF to process each file in parallel using Ray Dataset map method
|
504
|
+
def analyze_file(row):
|
505
|
+
file_id = row["file_id"]
|
506
|
+
url = row["url"]
|
507
|
+
|
508
|
+
# DeltaCAT automatically infers the right Ray Data reader for the URL
|
509
|
+
dataset = dc.get(url)
|
510
|
+
records = dataset.take_all()
|
511
|
+
url_type = dc.DatastoreType.from_url(url)
|
512
|
+
|
513
|
+
# Extract standard Ray Dataset fields for each file type
|
514
|
+
if url_type == dc.DatastoreType.IMAGES:
|
515
|
+
image = records[0]["image"]
|
516
|
+
analysis = f"Image {image.shape[1]}x{image.shape[0]} pixels"
|
517
|
+
elif url_type == dc.DatastoreType.CSV:
|
518
|
+
analysis = f"CSV with {len(records)} rows, {len(records[0].keys())} columns"
|
519
|
+
elif url_type == dc.DatastoreType.AUDIO:
|
520
|
+
sample_rate = records[0]["sample_rate"]
|
521
|
+
duration = len(records[0]["amplitude"][0]) / sample_rate
|
522
|
+
analysis = f"Audio {duration:.1f}s, {sample_rate}Hz"
|
523
|
+
elif url_type == dc.DatastoreType.JSON:
|
524
|
+
analysis = f"JSON with {len(records[0].keys())} fields"
|
525
|
+
elif url_type == dc.DatastoreType.TEXT:
|
526
|
+
analysis = f"Text with {len(records)} records"
|
527
|
+
else:
|
528
|
+
analysis = f"Binary with {len(records[0]['bytes'])} bytes"
|
529
|
+
|
530
|
+
return {"file_id": file_id, "analysis": analysis}
|
531
|
+
|
532
|
+
# Read the multimodal_files table as a Ray Dataset
|
533
|
+
ray_dataset = dc.read("multimodal_files", read_as=dc.DatasetType.RAY_DATASET)
|
534
|
+
# Download and analyze each URL in parallel using map
|
535
|
+
results_dataset = ray_dataset.map(analyze_file)
|
536
|
+
|
537
|
+
# Write results back to the multimodal_files table
|
538
|
+
dc.write(results_dataset, "multimodal_files", mode=dc.TableWriteMode.MERGE)
|
539
|
+
|
540
|
+
# Read final results and compare to initial dataset
|
541
|
+
print("\n=== Initial Dataset ===")
|
542
|
+
print(dc.to_pandas(ray_dataset))
|
543
|
+
|
544
|
+
print("\n=== Final Results with Analysis ===")
|
545
|
+
print(dc.read("multimodal_files", read_as=dc.DatasetType.PANDAS))
|
546
|
+
```
|
547
|
+
|
548
|
+
The default dataset type used by `dc.get` is a Ray Dataset but, similar to `dc.read`, `dc.get` can also read URLs into other dataset types like Daft:
|
549
|
+
|
550
|
+
```python
|
551
|
+
import deltacat as dc
|
552
|
+
|
553
|
+
# Create dataset with DeltaCAT URLs pointing to existing files
|
554
|
+
urls = [
|
555
|
+
# URLs with common file extensions will have their content type inferred.
|
556
|
+
"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
|
557
|
+
"https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
|
558
|
+
# URLs without common file extensions will be read as binary by default.
|
559
|
+
"https://picsum.photos/200"
|
560
|
+
]
|
561
|
+
|
562
|
+
# Download each URL into a Daft DataFrame serially
|
563
|
+
for url in urls:
|
564
|
+
dataset = dc.get(url, read_as=dc.DatasetType.DAFT)
|
565
|
+
print(f"\n=== {url} ===")
|
566
|
+
print(dataset.show())
|
567
|
+
```
|
568
|
+
|
569
|
+
</details>
|
570
|
+
|
571
|
+
<details>
|
572
|
+
|
459
573
|
<summary><span style="font-size: 1.25em; font-weight: bold;">Organizing Tables with Namespaces</span></summary>
|
460
574
|
|
461
575
|
In DeltaCAT, table **Namespaces** are optional but useful for organizing related tables within a catalog:
|
@@ -534,9 +648,9 @@ print(finance_df)
|
|
534
648
|
|
535
649
|
<details>
|
536
650
|
|
537
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
651
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Transactions</span></summary>
|
538
652
|
|
539
|
-
DeltaCAT transactions can span multiple tables and namespaces. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
|
653
|
+
DeltaCAT transactions can span multiple tables and namespaces. Since transaction history is maintained at the catalog level, every transaction operates against a consistent snapshot of every object in your data lake. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
|
540
654
|
|
541
655
|
Consider the previous example that organized tables with namespaces. One table tracked customer orders, and another table tracked the lifetime payments of each customer. If one table was updated but not the other, then it would result in an accounting discrepancy. This edge case can be eliminated by using multi-table transactions:
|
542
656
|
|
@@ -630,7 +744,7 @@ print(dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS))
|
|
630
744
|
|
631
745
|
<details>
|
632
746
|
|
633
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
747
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Managing Multiple Data Lakes</span></summary>
|
634
748
|
|
635
749
|
DeltaCAT lets you work with multiple catalogs in a single application. All catalogs registered with DeltaCAT are tracked by a Ray Actor to make them available to all workers in your Ray application.
|
636
750
|
|
@@ -652,8 +766,8 @@ dc.init(catalogs={
|
|
652
766
|
filesystem=pa.fs.LocalFileSystem()
|
653
767
|
)),
|
654
768
|
"prod": dc.Catalog(config=dc.CatalogProperties(
|
655
|
-
root=
|
656
|
-
filesystem=pa.fs.
|
769
|
+
root="s3://example/deltacat/", # Use S3 for prod
|
770
|
+
filesystem=pa.fs.S3FileSystem()
|
657
771
|
))
|
658
772
|
})
|
659
773
|
|
@@ -705,9 +819,9 @@ print(dc.read("financial_data", catalog="prod", read_as=dc.DatasetType.PANDAS))
|
|
705
819
|
|
706
820
|
<details>
|
707
821
|
|
708
|
-
<summary><span style="font-size: 1.25em; font-weight: bold;">
|
822
|
+
<summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Time Travel</span></summary>
|
709
823
|
|
710
|
-
DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with
|
824
|
+
DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with catalog-level transactions, this enables consistent point-in-time views across your entire data lake.
|
711
825
|
|
712
826
|
```python
|
713
827
|
import deltacat as dc
|
@@ -847,7 +961,7 @@ print("\nTime travel validation successful!")
|
|
847
961
|
|
848
962
|
<summary><span style="font-size: 1.25em; font-weight: bold;">Multimodal Batch Inference</span></summary>
|
849
963
|
|
850
|
-
DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed
|
964
|
+
DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed predictions for each image:
|
851
965
|
|
852
966
|
> **Requirements**: This example requires PyTorch ≥ 2.8.0 and torchvision ≥ 0.23.0. Install via: `pip install torch>=2.8.0 torchvision>=0.23.0`
|
853
967
|
|
@@ -938,7 +1052,7 @@ final_df.show()
|
|
938
1052
|
|
939
1053
|
<summary><span style="font-size: 1.25em; font-weight: bold;">LLM Batch Inference</span></summary>
|
940
1054
|
|
941
|
-
DeltaCAT multi-table transactions, time travel
|
1055
|
+
DeltaCAT multi-table transactions, data lake time travel, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
|
942
1056
|
|
943
1057
|
```python
|
944
1058
|
import deltacat as dc
|
@@ -1,6 +1,6 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=8oHmukh7qFhVfUT89l4zBtonbu_wsoj2hJsPaka0PoA,4452
|
2
2
|
deltacat/annotations.py,sha256=9lBi34DpIV_RPjCCK2Aiz_6nMyd-e-_CfQ1XtdRQQlM,1196
|
3
|
-
deltacat/api.py,sha256=
|
3
|
+
deltacat/api.py,sha256=W7u3jeKZsvTWNi9zkhOC2O6BMwZ3TvMRbnbpcn6lFBo,21940
|
4
4
|
deltacat/constants.py,sha256=HPE3SbK1-LRjtTu3OKD9s4N__LWMwj3xFP2N3Qy8fzM,4701
|
5
5
|
deltacat/env.py,sha256=BJdTt8od3IVR4RMLjBxy4oRUHM7Lb16AzMOz8-hpwOI,2303
|
6
6
|
deltacat/exceptions.py,sha256=dqZizcMKC3VwO7EgHXdAC4YUivBKVJgNwQLibMP93MA,16051
|
@@ -23,7 +23,7 @@ deltacat/catalog/__init__.py,sha256=lsu9N2G6P6HkyvrIpGY34SVkJM8-lwVaNfZanNTRjAc,
|
|
23
23
|
deltacat/catalog/delegate.py,sha256=RDOQHaYvpvwc3RTZNaJhv00yXV1WHgE8YcD4i19H6g0,26870
|
24
24
|
deltacat/catalog/interface.py,sha256=rmJSVi8dNORVa0ydzRFRwMcbpXwhDjYEpGAIGi-4O08,18486
|
25
25
|
deltacat/catalog/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
|
-
deltacat/catalog/main/impl.py,sha256=
|
26
|
+
deltacat/catalog/main/impl.py,sha256=V-JeQkAIQOVwbupJ_-2sSMUQ5P12crcWF3qArFVHMg8,100224
|
27
27
|
deltacat/catalog/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
deltacat/catalog/model/catalog.py,sha256=gvfczu9yhvDIjPjx5ZE69IUu1I_nhZOHURsOAakzhcQ,12765
|
29
29
|
deltacat/catalog/model/properties.py,sha256=Bt7JgmG9UQD9ABqrCXniGrbRWpYWbini9ZCY8dBhifU,5416
|
@@ -132,7 +132,7 @@ deltacat/docs/autogen/schema/inference/generate_type_mappings.py,sha256=ZH30xcsA
|
|
132
132
|
deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py,sha256=_N37jw5nmNlf40V2mOjDcXdJNhm1qoEa_fQdz_XRk1c,28929
|
133
133
|
deltacat/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
134
134
|
deltacat/examples/basic_logging.py,sha256=Umrum-gvY3gJjDNJ4hOMslMMq9bzeTM-s_DO4dGqJiw,2833
|
135
|
-
deltacat/examples/hello_world.py,sha256=
|
135
|
+
deltacat/examples/hello_world.py,sha256=dm4GNvNL_HElPtE50sZzaZFrV48BcRL89nZp9SnLSIw,799
|
136
136
|
deltacat/examples/compactor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
137
137
|
deltacat/examples/compactor/bootstrap.py,sha256=6BXDWsvH3QuSDmd31Wc0I4_qLy9lZTW4_029MGRslzA,35126
|
138
138
|
deltacat/examples/compactor/compactor.py,sha256=_FbM9paIly4JK3FYP3t5nDPNL98I6K9UbhidachNaAE,12431
|
@@ -155,8 +155,8 @@ deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py,sha256=N
|
|
155
155
|
deltacat/examples/experimental/iceberg/converter/beam/utils/common.py,sha256=wrUk-8sojz4sudZPMzCHyNVLsw1opBg23C9_q6z8AhA,6388
|
156
156
|
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py,sha256=CAMzNgeDDt4UKVTnUCEu8oRTB57rjBUwK6MxLLO3GBA,10046
|
157
157
|
deltacat/examples/indexer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
158
|
-
deltacat/examples/indexer/indexer.py,sha256=
|
159
|
-
deltacat/examples/indexer/job_runner.py,sha256=
|
158
|
+
deltacat/examples/indexer/indexer.py,sha256=7SqMfzte-PzSKcOsVQ9k-F9dODio70yzU0M0S3CldH8,6553
|
159
|
+
deltacat/examples/indexer/job_runner.py,sha256=Xwm6raw-Bx_Gq-8uMcw8ohdja2L6HgDlBLwCWDsRnbg,6398
|
160
160
|
deltacat/examples/indexer/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
161
161
|
deltacat/examples/indexer/gcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
162
162
|
deltacat/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -251,7 +251,7 @@ deltacat/storage/model/manifest.py,sha256=NnriTMm6waxixsjykfTABh4EWf985tA-A19AA2
|
|
251
251
|
deltacat/storage/model/metafile.py,sha256=g-dgFX5fmW51EGhjiN5jpHR1LTOhf0jhUIMBapRD1Rw,58619
|
252
252
|
deltacat/storage/model/namespace.py,sha256=9V1Qj232uc_UrVzZPIRzXyeYhJOYZ25wPLCx15-dx1Y,2630
|
253
253
|
deltacat/storage/model/partition.py,sha256=UcHcBQV0Kf_RnVIFzoYQo8MUdOrjZdWsaiGUv8FKXx4,24298
|
254
|
-
deltacat/storage/model/schema.py,sha256=
|
254
|
+
deltacat/storage/model/schema.py,sha256=bGtrm3xB0cr20HRICcRE6vQ5JKaTTrPGKIQ2cPaaWC8,122763
|
255
255
|
deltacat/storage/model/shard.py,sha256=boPOW45bwLwBazfXZpa3-C5SUSlgelpHf8Yl6357Bq0,1575
|
256
256
|
deltacat/storage/model/sort_key.py,sha256=68TJavprndKLESnWfCjXaeMwFE6tcq3ZVOHloE9rV6Q,7287
|
257
257
|
deltacat/storage/model/stream.py,sha256=VJgqVy4NS6IHLBLRf5OyehldZbIrarqGZYN07XF4Yp4,12609
|
@@ -285,10 +285,10 @@ deltacat/tests/_io/reader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
285
285
|
deltacat/tests/_io/reader/test_deltacat_read_api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
286
286
|
deltacat/tests/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
287
287
|
deltacat/tests/aws/test_clients.py,sha256=23GMWfz27WWBDXSqphG9mfputsyS7j3I5P_HRk4YoKE,3790
|
288
|
-
deltacat/tests/aws/test_s3u.py,sha256=
|
288
|
+
deltacat/tests/aws/test_s3u.py,sha256=M27w8BDbv638ReYtr5kA2eXcd3xJmRwfOYxLV0tax_s,7268
|
289
289
|
deltacat/tests/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
290
290
|
deltacat/tests/catalog/test_catalogs.py,sha256=MFS_fISQq7VGzjaVVM1K9O_QSE-02SN0gHwUtQ7m-HU,11696
|
291
|
-
deltacat/tests/catalog/test_default_catalog_impl.py,sha256=
|
291
|
+
deltacat/tests/catalog/test_default_catalog_impl.py,sha256=_YiLAHCxECd_lQKaFu22qlH1dDWMQmxo6qRkpxuxPrI,476575
|
292
292
|
deltacat/tests/catalog/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
293
293
|
deltacat/tests/catalog/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
294
294
|
deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py,sha256=XvMoW1yjjoIs4-0A8_dqeY7ArysN6HJkSSHk7JnHeUI,4313
|
@@ -404,9 +404,9 @@ deltacat/tests/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
404
404
|
deltacat/tests/utils/ray_utils/test_concurrency.py,sha256=TjZpX0cjMDEIS79p_--j_BfT0zXKNkTLY1ZzNokBTs0,1211
|
405
405
|
deltacat/tests/utils/ray_utils/test_dataset.py,sha256=glfihM4FBqqIWcW5SdU-SYqhmeMIPfl8Krfzj0oEviI,6418
|
406
406
|
deltacat/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
407
|
-
deltacat/types/media.py,sha256=
|
407
|
+
deltacat/types/media.py,sha256=yIWs6Wcb00bnZeQvwdpQoXYbjvw4BL81dWZU50XE23g,22317
|
408
408
|
deltacat/types/partial_download.py,sha256=QIpNTSwaiZ4TVl4A1N4PtblevKT5GwdXtGrouQMQs1E,2510
|
409
|
-
deltacat/types/tables.py,sha256=
|
409
|
+
deltacat/types/tables.py,sha256=SrnPQB2-VFxkUSSRIDnImVhV39OnFJYHpZ3yH9Y3BI8,85083
|
410
410
|
deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
411
411
|
deltacat/utils/arguments.py,sha256=WzEjt8N4rXE5Vkcirz18ppZguBENxYm8F8m97LshV1Y,2162
|
412
412
|
deltacat/utils/cloudpickle.py,sha256=XE7YDmQe56ksfl3NdYZkzOAhbHSuhNcBZGOehQpgZr0,1187
|
@@ -417,23 +417,23 @@ deltacat/utils/filesystem.py,sha256=jQ_vY0lBJcSuKqSOjSwB7q-s52ckYhJSvnGT_aYZvUc,
|
|
417
417
|
deltacat/utils/metafile_locator.py,sha256=AJ6o2V5Cc7rJE89wWyKmsFIWOxmGM2APs8DCynmuTjg,2984
|
418
418
|
deltacat/utils/metrics.py,sha256=HYKyZSrtVLu8gXezg_TMNUKJp4h1WWI0VEzn0Xlzf-I,10778
|
419
419
|
deltacat/utils/numpy.py,sha256=tgq4j_9q9bERxsr0-h3t55BrciS2ivr1AZe7R1DldkA,5524
|
420
|
-
deltacat/utils/pandas.py,sha256=
|
420
|
+
deltacat/utils/pandas.py,sha256=v8wS_pArpyVJ1p3oYFs2uy6qt38httUYXfVHdAGlPso,31418
|
421
421
|
deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
|
422
422
|
deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
|
423
|
-
deltacat/utils/polars.py,sha256
|
424
|
-
deltacat/utils/pyarrow.py,sha256=
|
423
|
+
deltacat/utils/polars.py,sha256=-_6CGDhhZ_g8Z2sTc_GBWvHSNXNWOszitxYeckrOQ9g,28906
|
424
|
+
deltacat/utils/pyarrow.py,sha256=i3__I5c1UCEjG8N1i2szWc2vHYUM4Hz1T1g1-mfQYfw,74410
|
425
425
|
deltacat/utils/reader_compatibility_mapping.py,sha256=fZcNdw4kamkQF-ZzvBC4Zp_sbjxp0yOVIhLgV6V2Ee8,91409
|
426
426
|
deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
|
427
427
|
deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
|
428
|
-
deltacat/utils/url.py,sha256=
|
428
|
+
deltacat/utils/url.py,sha256=H9L6Pgr8MNtUqFBwPAygogbZ2mZ1Het308J7mu0kpyQ,45690
|
429
429
|
deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
430
430
|
deltacat/utils/ray_utils/collections.py,sha256=hj20s4D2RF2jZETU_44r6mFbsczA0JI_I_4kWKTmqes,1951
|
431
431
|
deltacat/utils/ray_utils/concurrency.py,sha256=Ceui6nQYKHTUOTltHNQIdb0OWHFhD73o8DhSXP-DYRQ,5457
|
432
432
|
deltacat/utils/ray_utils/dataset.py,sha256=5RnVqFlKoZ6zabnQfjfXAKWuXDMKvLp4eNcDgpFj3OM,6480
|
433
433
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
434
434
|
deltacat/utils/ray_utils/runtime.py,sha256=cf5koY9q4TzRg--BjPtC6y0jztq45F39KcC4K6Wmg4w,6946
|
435
|
-
deltacat-2.0.0.
|
436
|
-
deltacat-2.0.0.
|
437
|
-
deltacat-2.0.0.
|
438
|
-
deltacat-2.0.0.
|
439
|
-
deltacat-2.0.0.
|
435
|
+
deltacat-2.0.0.post2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
436
|
+
deltacat-2.0.0.post2.dist-info/METADATA,sha256=lIEs05JT0ZWkYJoM2JPh77nkvFhgVQ2yyieQPxA0Ofo,52106
|
437
|
+
deltacat-2.0.0.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
438
|
+
deltacat-2.0.0.post2.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
439
|
+
deltacat-2.0.0.post2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|