deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1163 @@
1
+ Metadata-Version: 2.4
2
+ Name: deltacat
3
+ Version: 2.0.0b12
4
+ Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
5
+ Home-page: https://github.com/ray-project/deltacat
6
+ Author: Ray Team
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: aws-embedded-metrics==3.2.0
17
+ Requires-Dist: boto3~=1.34
18
+ Requires-Dist: google-cloud-storage
19
+ Requires-Dist: gcsfs==2025.3.2
20
+ Requires-Dist: daft==0.4.15
21
+ Requires-Dist: intervaltree==3.1.0
22
+ Requires-Dist: numpy==1.22.4
23
+ Requires-Dist: pandas==2.2.3
24
+ Requires-Dist: polars==1.28.1
25
+ Requires-Dist: pyarrow==16.0.0
26
+ Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3
27
+ Requires-Dist: pymemcache==4.0.0
28
+ Requires-Dist: ray[default]==2.46.0
29
+ Requires-Dist: tenacity==8.2.3
30
+ Requires-Dist: typing-extensions==4.6.1
31
+ Requires-Dist: redis==5.0.0
32
+ Requires-Dist: schedule==1.2.0
33
+ Provides-Extra: iceberg
34
+ Requires-Dist: pyiceberg[glue]>=0.9.0; extra == "iceberg"
35
+ Requires-Dist: pyiceberg[hive]>=0.9.0; extra == "iceberg"
36
+ Requires-Dist: pyiceberg[sql-sqlite]>=0.9.0; extra == "iceberg"
37
+ Provides-Extra: beam
38
+ Requires-Dist: apache-beam==2.65.0; extra == "beam"
39
+ Provides-Extra: s3fs
40
+ Requires-Dist: s3fs==2025.3.2; extra == "s3fs"
41
+ Dynamic: author
42
+ Dynamic: classifier
43
+ Dynamic: description
44
+ Dynamic: description-content-type
45
+ Dynamic: home-page
46
+ Dynamic: license-file
47
+ Dynamic: provides-extra
48
+ Dynamic: requires-dist
49
+ Dynamic: requires-python
50
+ Dynamic: summary
51
+
52
+ <p align="center">
53
+ <img src="media/deltacat-logo-alpha-750.png" alt="deltacat logo" style="width:55%; height:auto; text-align: center;">
54
+ </p>
55
+
56
+ DeltaCAT is a portable Pythonic Data Lakehouse powered by [Ray](https://github.com/ray-project/ray). It lets you define and manage
57
+ fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
58
+ data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
59
+
60
+ It uses the Ray distributed compute framework together with [Apache Arrow](https://github.com/apache/arrow) and
61
+ [Daft](https://github.com/Eventual-Inc/Daft) to efficiently scale common table management tasks, like petabyte-scale
62
+ merge-on-read and copy-on-write operations.
63
+
64
+ DeltaCAT provides the following high-level components:
65
+ 1. [**Catalog**](deltacat/catalog/interface.py): High-level APIs to create, discover, organize, share, and manage datasets.
66
+ 2. [**Compute**](deltacat/compute/): Distributed data management procedures to read, write, and optimize datasets.
67
+ 3. [**Storage**](deltacat/storage/): In-memory and on-disk multimodal dataset formats.
68
+ 4. **Sync** (in development): Synchronize DeltaCAT datasets to data warehouses and other table formats.
69
+
70
+ ## Overview
71
+ DeltaCAT's **Catalog**, **Compute**, and **Storage** layers work together to bring ACID-compliant data management to any Ray application. These components automate data indexing, change management, dataset read/write optimization, schema evolution, and other common data management tasks across any set of data files readable by Ray Data, Daft, Pandas, Polars, PyArrow, or NumPy.
72
+
73
+ <p align="center">
74
+ <img src="media/deltacat-tech-overview.png" alt="deltacat tech overview" style="width:100%; height:auto; text-align: center;">
75
+ </p>
76
+
77
+ Data consumers that prefer to stay within the ecosystem of Pythonic data management tools can use DeltaCAT's native table format to manage their data with minimal overhead. For integration with other data analytics frameworks (e.g., Apache Spark, Trino, Apache Flink), DeltaCAT's **Sync** component offers zero-copy synchronization of your tables to Apache Iceberg and other table formats.
78
+
79
+ ## Getting Started
80
+ DeltaCAT applications run anywhere that Ray runs, including your local laptop, cloud computing cluster, or on-premise cluster.
81
+
82
+ DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of one or more data files. A **Catalog** provides a root location (e.g., a local file path or S3 Bucket) to store table information, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
83
+
84
+ ### Quick Start
85
+
86
+ ```python
87
+ import deltacat as dc
88
+ import pandas as pd
89
+
90
+ # Initialize DeltaCAT with a default local catalog.
91
+ # Ray will be initialized automatically.
92
+ # Catalog files will be stored in .deltacat/ in the current working directory.
93
+ dc.init_local()
94
+
95
+ # Create data to write.
96
+ data = pd.DataFrame({
97
+ "id": [1, 2, 3],
98
+ "name": ["Cheshire", "Dinah", "Felix"],
99
+ "age": [3, 7, 5]
100
+ })
101
+
102
+ # Write data to a table.
103
+ # Table creation is handled automatically.
104
+ dc.write(data, "users")
105
+
106
+ # Read the data back as a Daft DataFrame.
107
+ # Daft lazily and automatically distributes data across your Ray cluster.
108
+ daft_df = dc.read("users") # Returns Daft DataFrame (default)
109
+ daft_df.show() # Materialize and print the DataFrame
110
+
111
+ # Append more data and add a new column.
112
+ # Compaction and schema evolution are handled automatically.
113
+ data = pd.DataFrame({
114
+ "id": [4, 5, 6],
115
+ "name": ["Tom", "Simpkin", "Delta"],
116
+ "age": [2, 12, 4],
117
+ "city": ["Hollywood", "Gloucester", "San Francisco"]
118
+ })
119
+ dc.write(data, "users")
120
+
121
+ # Read the full table back into a Daft DataFrame.
122
+ daft_df = dc.read("users")
123
+ # Print the names, ages, and cities (missing cities will be null).
124
+ daft_df.select("name", "age", "city").show()
125
+ ```
126
+
127
+ ### Core Concepts
128
+ DeltaCAT can do much more than just append data to tables and read it back again. Expand the sections below to see examples of other core DeltaCAT concepts and APIs.
129
+
130
+ <details>
131
+
132
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Replacing and Dropping Tables</span></summary>
133
+
134
+ If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **appends** table data by default. One way to prevent this perpetual table growth and make the example idempotent is to use the **REPLACE** write mode if the table already exists:
135
+
136
+ ```python
137
+ import deltacat as dc
138
+ import pandas as pd
139
+
140
+ # Initialize DeltaCAT with a default local catalog.
141
+ # Ray will be initialized automatically.
142
+ # Catalog files will be stored in .deltacat/ in the current working directory.
143
+ dc.init_local()
144
+
145
+ # Create data to write.
146
+ data = pd.DataFrame({
147
+ "id": [1, 2, 3],
148
+ "name": ["Cheshire", "Dinah", "Felix"],
149
+ "age": [3, 7, 5]
150
+ })
151
+
152
+ # Default the write mode to CREATE.
153
+ # This will fail if the table already exists.
154
+ write_mode = dc.TableWriteMode.CREATE
155
+
156
+ # Change write mode to REPLACE if the table already exists.
157
+ if dc.table_exists("users"):
158
+ write_mode = dc.TableWriteMode.REPLACE
159
+
160
+ # Write data to a fresh, empty table.
161
+ dc.write(data, "users", mode=write_mode)
162
+
163
+ # Read the data back as a Daft DataFrame.
164
+ # Daft lazily and automatically distributes data across your Ray cluster.
165
+ daft_df = dc.read("users") # Returns Daft DataFrame (default)
166
+ daft_df.show() # Materialize and print the DataFrame
167
+
168
+ # Explicitly append more data and add a new column.
169
+ # Compaction and schema evolution are handled automatically.
170
+ data = pd.DataFrame({
171
+ "id": [4, 5, 6],
172
+ "name": ["Tom", "Simpkin", "Delta"],
173
+ "age": [2, 12, 4],
174
+ "city": ["Hollywood", "Gloucester", "San Francisco"]
175
+ })
176
+ dc.write(data, "users", mode=dc.TableWriteMode.APPEND)
177
+
178
+ # Read the full table back into a Daft DataFrame.
179
+ daft_df = dc.read("users")
180
+ # Print the names, ages, and cities (missing cities will be null).
181
+ daft_df.select("name", "age", "city").show()
182
+ # Ensure that the table length is always 6.
183
+ assert dc.dataset_length(daft_df) == 6
184
+ ```
185
+
186
+ No matter how many times you run the above code, the table will always contain 6 records. Another way to achieve the same result is to use `dc.drop_table`:
187
+
188
+ ```python
189
+ import deltacat as dc
190
+ from deltacat.exceptions import TableNotFoundError
191
+ import pandas as pd
192
+
193
+ # Initialize DeltaCAT with a default local catalog.
194
+ # Ray will be initialized automatically.
195
+ # Catalog files will be stored in .deltacat/ in the current working directory.
196
+ dc.init_local()
197
+
198
+ # Create data to write.
199
+ data = pd.DataFrame({
200
+ "id": [1, 2, 3],
201
+ "name": ["Cheshire", "Dinah", "Felix"],
202
+ "age": [3, 7, 5]
203
+ })
204
+
205
+ # Drop the table if it exists.
206
+ try:
207
+ dc.drop_table("users")
208
+ print("Dropped 'users' table.")
209
+ except TableNotFoundError:
210
+ print("Table 'users' not found. Creating it...")
211
+
212
+ # Write data to a new table.
213
+ dc.write(data, "users", mode=dc.TableWriteMode.CREATE)
214
+
215
+ # Read the data back as a Daft DataFrame.
216
+ # Daft lazily and automatically distributes data across your Ray cluster.
217
+ daft_df = dc.read("users") # Returns Daft DataFrame (default)
218
+ daft_df.show() # Materialize and print the DataFrame
219
+
220
+ # Explicitly append more data and add a new column.
221
+ # Compaction and schema evolution are handled automatically.
222
+ data = pd.DataFrame({
223
+ "id": [4, 5, 6],
224
+ "name": ["Tom", "Simpkin", "Delta"],
225
+ "age": [2, 12, 4],
226
+ "city": ["Hollywood", "Gloucester", "San Francisco"]
227
+ })
228
+ dc.write(data, "users", mode=dc.TableWriteMode.APPEND)
229
+
230
+ # Read the full table back into a Daft DataFrame.
231
+ daft_df = dc.read("users")
232
+ # Print the names, ages, and cities (missing cities will be null).
233
+ daft_df.select("name", "age", "city").show()
234
+ # Ensure that the table length is always 6.
235
+ assert dc.dataset_length(daft_df) == 6
236
+ ```
237
+
238
+ </details>
239
+
240
+ <details>
241
+
242
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Supported Dataset and File Formats</span></summary>
243
+
244
+ DeltaCAT natively supports a variety of open dataset and file formats already integrated with Ray and Arrow. You can use `dc.read` to read tables back as a Daft DataFrame, Ray Dataset, Pandas DataFrame, PyArrow Table, Polars DataFrame, NumPy Array, or list of PyArrow ParquetFile objects:
245
+
246
+ ```python
247
+ # Read directly into eagerly materialized local datasets.
248
+ # Local datasets are best for reading small tables that fit in local memory:
249
+ pandas_df = dc.read("users", read_as=dc.DatasetType.PANDAS) # Pandas DataFrame
250
+ print("\n=== Pandas ===")
251
+ print(pandas_df)
252
+
253
+ pyarrow_table = dc.read("users", read_as=dc.DatasetType.PYARROW) # PyArrow Table
254
+ print("\n=== PyArrow ===")
255
+ print(pyarrow_table)
256
+
257
+ polars_df = dc.read("users", read_as=dc.DatasetType.POLARS) # Polars DataFrame
258
+ print("\n=== Polars ===")
259
+ print(polars_df)
260
+
261
+ numpy_array = dc.read("users", read_as=dc.DatasetType.NUMPY) # NumPy Array
262
+ print("\n=== NumPy ===")
263
+ print(numpy_array)
264
+
265
+ # Or read into lazily materialized PyArrow ParquetFile objects.
266
+ # PyArrow ParquetFile objects are useful for reading larger tables that don't
267
+ # fit in local memory, but you'll need to manually handle data distribution
268
+ # and materialization:
269
+ pyarrow_pq_files = dc.read("users", read_as=dc.DatasetType.PYARROW_PARQUET) # ParquetFile or List[ParquetFile]
270
+ print("\n=== PyArrow Parquet Unmaterialized ===")
271
+ print(pyarrow_pq_files)
272
+ print("\n=== PyArrow Parquet Materialized ===")
273
+ print(dc.to_pyarrow(pyarrow_pq_files)) # Materialize and print the ParquetFile refs
274
+
275
+ # Or read into distributed datasets for scalable data processing.
276
+ # Distributed datasets are the easiest way to read large tables that don't fit
277
+ # in either local or distributed Ray cluster memory. They automatically handle
278
+ # data distribution and materialization:
279
+ daft_df = dc.read("users", read_as=dc.DatasetType.DAFT) # Daft DataFrame (Default)
280
+ print("\n=== Daft ===")
281
+ daft_df.show() # Materialize and print the Daft DataFrame
282
+
283
+ ray_dataset = dc.read("users", read_as=dc.DatasetType.RAY_DATASET) # Ray Dataset
284
+ print("\n=== Ray Data ===")
285
+ ray_dataset.show() # Materialize and print the Ray Dataset
286
+ ```
287
+
288
+ `dc.write` can also write any of these dataset types:
289
+
290
+ ```python
291
+ import pyarrow as pa
292
+
293
+ # Create a pyarrow table to write.
294
+ pyarrow_table = pa.Table.from_pydict({
295
+ "id": [4, 5, 6],
296
+ "name": ["Tom", "Simpkin", "Delta"],
297
+ "age": [2, 12, 4],
298
+ "city": ["Hollywood", "Gloucester", "San Francisco"]
299
+ })
300
+
301
+ # Write different dataset types to the default table file format (Parquet):
302
+ dc.write(pyarrow_table, "my_pyarrow_table") # Write PyArrow Table
303
+ print("\n=== PyArrow Table ===")
304
+ dc.read("my_pyarrow_table").show()
305
+
306
+ daft_df = dc.from_pyarrow(pyarrow_table, dc.DatasetType.DAFT)
307
+ dc.write(daft_df, "my_daft_table") # Write Daft DataFrame
308
+ print("\n=== Daft Table ===")
309
+ dc.read("my_daft_table").show()
310
+
311
+ ray_dataset = dc.from_pyarrow(pyarrow_table, dc.DatasetType.RAY_DATASET)
312
+ dc.write(ray_dataset, "my_ray_dataset") # Write Ray Dataset
313
+ print("\n=== Ray Dataset ===")
314
+ dc.read("my_ray_dataset").show()
315
+
316
+ pandas_df = dc.from_pyarrow(pyarrow_table, dc.DatasetType.PANDAS)
317
+ dc.write(pandas_df, "my_pandas_table") # Write Pandas DataFrame
318
+ print("\n=== Pandas Table ===")
319
+ dc.read("my_pandas_table").show()
320
+
321
+ polars_df = dc.from_pyarrow(pyarrow_table, dc.DatasetType.POLARS)
322
+ dc.write(polars_df, "my_polars_table") # Write Polars DataFrame
323
+ print("\n=== Polars Table ===")
324
+ dc.read("my_polars_table").show()
325
+
326
+ numpy_array = dc.from_pyarrow(pyarrow_table, dc.DatasetType.NUMPY)
327
+ dc.write(numpy_array, "my_numpy_table") # Write NumPy Array
328
+ print("\n=== NumPy Table ===")
329
+ dc.read("my_numpy_table").show()
330
+ ```
331
+
332
+ Or write to different table file formats:
333
+
334
+ ```python
335
+ data = pd.DataFrame({"id": [1], "name": ["Cheshire"], "age": [3]})
336
+
337
+ # Start by writing to a new table with a custom list of supported readers.
338
+ # Define the content types we want to write.
339
+ content_types_to_write = {
340
+ dc.ContentType.PARQUET,
341
+ dc.ContentType.AVRO,
342
+ dc.ContentType.ORC,
343
+ dc.ContentType.FEATHER,
344
+ }
345
+ # Limit supported readers to dataset types that can read the above content types.
346
+ # By default, DeltaCAT will raise an error if we attempt to write data to
347
+ # a file format that can't be read by one or more dataset types.
348
+ supported_reader_types = [
349
+ reader_type for reader_type in dc.DatasetType
350
+ if content_types_to_write <= reader_type.readable_content_types()
351
+ ]
352
+ # Write to a new table with our custom list of supported readers.
353
+ dc.write(
354
+ data,
355
+ "my_mixed_format_table",
356
+ content_type=dc.ContentType.PARQUET, # Write Parquet (Default)
357
+ table_properties={dc.TableProperty.SUPPORTED_READER_TYPES: supported_reader_types}
358
+ )
359
+
360
+ # Now write the same data to other file formats:
361
+ dc.write(data, "my_mixed_format_table", content_type=dc.ContentType.AVRO)
362
+ dc.write(data, "my_mixed_format_table", content_type=dc.ContentType.ORC)
363
+ dc.write(data, "my_mixed_format_table", content_type=dc.ContentType.FEATHER)
364
+
365
+ # Read the table back.
366
+ # All formats are automatically unified into the requested Pandas DataFrame:
367
+ pandas_df = dc.read("my_mixed_format_table", read_as=dc.DatasetType.PANDAS)
368
+ print(pandas_df)
369
+ ```
370
+
371
+ </details>
372
+
373
+ <details>
374
+
375
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Merging and Deleting Data</span></summary>
376
+
377
+ DeltaCAT can automatically merge and delete data by defining a table schema with one or more merge keys:
378
+
379
+ ```python
380
+ import deltacat as dc
381
+ import pandas as pd
382
+ import pyarrow as pa
383
+ import tempfile
384
+
385
+ # Initialize DeltaCAT with a fresh temporary catalog
386
+ dc.init_local(tempfile.mkdtemp())
387
+
388
+ # Define a schema with user_id as a merge key.
389
+ schema = dc.Schema.of([
390
+ dc.Field.of(pa.field("user_id", pa.int64()), is_merge_key=True),
391
+ dc.Field.of(pa.field("name", pa.string())),
392
+ dc.Field.of(pa.field("age", pa.int32())),
393
+ dc.Field.of(pa.field("status", pa.string())),
394
+ ])
395
+
396
+ # Initial user data
397
+ initial_users = pd.DataFrame({
398
+ "user_id": [1, 2, 3],
399
+ "name": ["Cheshire", "Dinah", "Felix"],
400
+ "age": [3, 7, 2],
401
+ "status": ["active", "active", "inactive"]
402
+ })
403
+
404
+ # Write initial data with the merge key schema
405
+ dc.write(initial_users, "users", schema=schema)
406
+
407
+ # Read the data back as a Pandas DataFrame.
408
+ df = dc.read("users", read_as=dc.DatasetType.PANDAS)
409
+ print("=== Initial Users ===")
410
+ print(df.sort_values("user_id"))
411
+
412
+ # Update data for existing users + add new users
413
+ updated_users = pd.DataFrame({
414
+ "user_id": [2, 3, 4, 5, 6],
415
+ "name": ["Dinah", "Felix", "Tom", "Simpkin", "Delta"],
416
+ "age": [7, 2, 5, 12, 4],
417
+ "status": ["premium", "active", "active", "active", "active"]
418
+ })
419
+
420
+ # Write automatically detects that the schema has a merge key and:
421
+ # 1. Updates existing records with matching user IDs.
422
+ # 2. Inserts new records with new user IDs.
423
+ dc.write(updated_users, "users", schema=schema)
424
+
425
+ # Read back to see merged results
426
+ df = dc.read("users", read_as=dc.DatasetType.PANDAS)
427
+ print("\n=== After Merge ===")
428
+ print(df.sort_values("user_id"))
429
+
430
+ # - Cheshire (user_id=1) remains unchanged
431
+ # - Dinah (user_id=2) status updated to "premium"
432
+ # - Felix (user_id=3) updated to "active"
433
+ # - New users (4,5,6), (Tom, Simpkin, Delta) added
434
+ # - No duplicate user_id values exist
435
+
436
+ # Specify the users to delete.
437
+ # We only need to specify matching merge key values.
438
+ users_to_delete = pd.DataFrame({
439
+ "user_id": [3, 5],
440
+ })
441
+
442
+ # Delete the records that match our merge keys.
443
+ dc.write(users_to_delete, "users", schema=schema, mode=dc.TableWriteMode.DELETE)
444
+
445
+ # Read the table back to confirm target users have been deleted.
446
+ df = dc.read("users", read_as=dc.DatasetType.PANDAS)
447
+ print("\n=== After Deletion ===")
448
+ print(df.sort_values("user_id"))
449
+
450
+ # - Felix (user_id=3) has been removed
451
+ # - Simpkin (user_id=5) has been removed
452
+ # - All other users remain unchanged
453
+ ```
454
+
455
+ </details>
456
+
457
+ <details>
458
+
459
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Organizing Tables with Namespaces</span></summary>
460
+
461
+ In DeltaCAT, table **Namespaces** are optional but useful for organizing related tables within a catalog:
462
+
463
+ ```python
464
+ import deltacat as dc
465
+ import pandas as pd
466
+ import tempfile
467
+
468
+ # Initialize DeltaCAT with a fresh temporary catalog
469
+ dc.init_local(tempfile.mkdtemp())
470
+
471
+ # Create some sample data for different business domains
472
+ user_data = pd.DataFrame({
473
+ "user_id": [1, 2, 3],
474
+ "name": ["Cheshire", "Dinah", "Felix"],
475
+ })
476
+
477
+ product_data = pd.DataFrame({
478
+ "product_id": [101, 102, 103],
479
+ "name": ["Mushrooms", "Fish", "Milk"],
480
+ "price": [12.99, 8.99, 3.99]
481
+ })
482
+
483
+ order_data = pd.DataFrame({
484
+ "order_id": [1001, 1002, 1003],
485
+ "user_id": [1, 2, 3],
486
+ "product_id": [101, 102, 103],
487
+ "quantity": [2, 1, 2]
488
+ })
489
+
490
+ # Write tables to different namespaces to organize them by domain
491
+ dc.write(user_data, "users", namespace="identity")
492
+ dc.write(product_data, "catalog", namespace="inventory")
493
+ dc.write(order_data, "transactions", namespace="sales")
494
+
495
+ # Read from specific namespaces
496
+ users_df = dc.read("users", namespace="identity", read_as=dc.DatasetType.PANDAS)
497
+ products_df = dc.read("catalog", namespace="inventory", read_as=dc.DatasetType.PANDAS)
498
+ orders_df = dc.read("transactions", namespace="sales", read_as=dc.DatasetType.PANDAS)
499
+
500
+ # Tables with the same name can exist in different namespaces
501
+ # Create separate marketing and finance views of users
502
+ marketing_users = pd.DataFrame({
503
+ "user_id": [1, 2, 3],
504
+ "segment": ["premium", "standard", "premium"],
505
+ "acquisition_channel": ["social", "search", "referral"]
506
+ })
507
+
508
+ finance_users = pd.DataFrame({
509
+ "user_id": [1, 2, 3],
510
+ "lifetime_payments": [25.98, 8.99, 7.98],
511
+ "preferred_payment_method": ["credit", "cash", "paypal"]
512
+ })
513
+
514
+ dc.write(marketing_users, "users", namespace="marketing")
515
+ dc.write(finance_users, "users", namespace="finance")
516
+
517
+ # Each namespace maintains its own "users" table with different schemas
518
+ marketing_df = dc.read("users", namespace="marketing", read_as=dc.DatasetType.PANDAS)
519
+ finance_df = dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS)
520
+
521
+ print(f"\n=== Identity Namespace Users ===")
522
+ print(users_df)
523
+ print(f"\n=== Inventory Namespace Products ===")
524
+ print(products_df)
525
+ print(f"\n=== Sales Namespace Transactions ===")
526
+ print(orders_df)
527
+ print(f"\n=== Marketing Namespace Users ===")
528
+ print(marketing_df)
529
+ print(f"\n=== Finance Namespace Users ===")
530
+ print(finance_df)
531
+ ```
532
+
533
+ </details>
534
+
535
+ <details>
536
+
537
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Multi-Table Transactions</span></summary>
538
+
539
+ DeltaCAT transactions can span multiple tables and namespaces. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
540
+
541
+ Consider the previous example that organized tables with namespaces. One table tracked customer orders, and another table tracked the lifetime payments of each customer. If one table was updated but not the other, then it would result in an accounting discrepancy. This edge case can be eliminated by using multi-table transactions:
542
+
543
+ ```python
544
+ import deltacat as dc
545
+ import pandas as pd
546
+ import pyarrow as pa
547
+ import tempfile
548
+
549
+ # Initialize DeltaCAT with a fresh temporary catalog
550
+ dc.init_local(tempfile.mkdtemp())
551
+
552
+ # Create sample product data.
553
+ product_data = pd.DataFrame({
554
+ "product_id": [101, 102, 103],
555
+ "name": ["Mushrooms", "Fish", "Milk"],
556
+ "price": [12.99, 8.99, 3.99]
557
+ })
558
+
559
+ # The product catalog can be created independently.
560
+ dc.write(product_data, "catalog", namespace="inventory")
561
+
562
+ print(f"\n=== Initial Product Data ===")
563
+ print(dc.read("catalog", namespace="inventory", read_as=dc.DatasetType.PANDAS))
564
+
565
+ # Create sample user and finance data.
566
+ user_data = pd.DataFrame({
567
+ "user_id": [1, 2, 3],
568
+ "name": ["Cheshire", "Dinah", "Felix"],
569
+ })
570
+ initial_finance = pd.DataFrame({
571
+ "user_id": [1, 2, 3],
572
+ "preferred_payment_method": ["credit", "cash", "paypal"]
573
+ })
574
+
575
+ # Define a finance schema.
576
+ # User ID is the merge key, and lifetime payments are defaulted to 0.00.
577
+ finance_schema = dc.Schema.of([
578
+ dc.Field.of(pa.field("user_id", pa.int64()), is_merge_key=True),
579
+ dc.Field.of(pa.field("lifetime_payments", pa.float64()), future_default=0.00),
580
+ dc.Field.of(pa.field("preferred_payment_method", pa.string())),
581
+ ])
582
+
583
+ # Create user identities and user finance data within a single transaction.
584
+ # Since transactions are atomic, this prevents accounting discrepancies.
585
+ with dc.transaction():
586
+ dc.write(user_data, "users", namespace="identity")
587
+ dc.write(initial_finance, "users", namespace="finance", schema=finance_schema)
588
+
589
+ print(f"\n=== Initial User Data ===")
590
+ print(dc.read("users", namespace="identity", read_as=dc.DatasetType.PANDAS))
591
+ print(f"\n=== Initial Finance Data ===")
592
+ print(dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS))
593
+
594
+ # Create new order data
595
+ new_orders = pd.DataFrame({
596
+ "order_id": [1001, 1002, 1003],
597
+ "user_id": [1, 2, 3],
598
+ "product_id": [101, 102, 103],
599
+ "quantity": [2, 1, 2]
600
+ })
601
+
602
+ # Process new orders and update lifetime payment totals within a single transaction.
603
+ with dc.transaction():
604
+ # Step 1: Write the new orders
605
+ dc.write(new_orders, "transactions", namespace="sales")
606
+
607
+ # Step 2: Read back transactions and products to compute actual totals
608
+ orders_df = dc.read("transactions", namespace="sales", read_as=dc.DatasetType.PANDAS)
609
+ products_df = dc.read("catalog", namespace="inventory", read_as=dc.DatasetType.PANDAS)
610
+
611
+ # Step 3: Compute lifetime payment totals by joining orders with product prices
612
+ orders_with_prices = orders_df.merge(products_df, on="product_id")
613
+ orders_with_prices["total"] = orders_with_prices["quantity"] * orders_with_prices["price"]
614
+
615
+ # Calculate lifetime totals per user
616
+ finance_updates = orders_with_prices.groupby("user_id")["total"].sum().reset_index()
617
+ finance_updates.columns = ["user_id", "lifetime_payments"]
618
+
619
+ # Step 4: Write the computed totals
620
+ dc.write(finance_updates, "users", namespace="finance", mode=dc.TableWriteMode.MERGE)
621
+
622
+ # Verify that orders and lifetime payments are kept in sync.
623
+ print(f"\n=== New Orders Processed ===")
624
+ print(dc.read("transactions", namespace="sales", read_as=dc.DatasetType.PANDAS))
625
+ print(f"\n=== Updated Finance Records ===")
626
+ print(dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS))
627
+ ```
628
+
629
+ </details>
630
+
631
+ <details>
632
+
633
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Working with Multiple Catalogs</span></summary>
634
+
635
+ DeltaCAT lets you work with multiple catalogs in a single application. All catalogs registered with DeltaCAT are tracked by a Ray Actor to make them available to all workers in your Ray application.
636
+
637
+ For example, you may want to test a write against a local staging catalog before committing it to a shared production catalog:
638
+
639
+ ```python
640
+ import deltacat as dc
641
+ from deltacat.exceptions import TableValidationError
642
+ import pandas as pd
643
+ import pyarrow as pa
644
+ import pyarrow.compute as pc
645
+ import tempfile
646
+ from decimal import Decimal
647
+
648
+ # Initialize catalogs with separate names and catalog roots.
649
+ dc.init(catalogs={
650
+ "staging": dc.Catalog(config=dc.CatalogProperties(
651
+ root=tempfile.mkdtemp(), # Use temporary directory for staging
652
+ filesystem=pa.fs.LocalFileSystem()
653
+ )),
654
+ "prod": dc.Catalog(config=dc.CatalogProperties(
655
+ root=tempfile.mkdtemp(), # Use temporary directory for prod
656
+ filesystem=pa.fs.LocalFileSystem()
657
+ ))
658
+ })
659
+
660
+ # Create a PyArrow table with decimal256 data
661
+ decimal_table = pa.table({
662
+ "item_id": [1, 2, 3],
663
+ "price": pa.array([
664
+ Decimal("999.99"),
665
+ Decimal("1234.56"),
666
+ Decimal("567.89")
667
+ ], type=pa.decimal256(10, 2))
668
+ })
669
+
670
+ # Try to write decimal256 data to the staging table.
671
+ # DeltaCAT auto-detects that decimal256 isn't readable
672
+ # by several default supported table reader types
673
+ # (Polars, Daft, Ray Data) and raises a TableValidationError.
674
+ try:
675
+ dc.write(decimal_table, "financial_data", catalog="staging")
676
+ print("Decimal256 write succeeded")
677
+ except TableValidationError as e:
678
+ print(f"\n=== Validation Error ===")
679
+ print(e)
680
+ print("Decimal256 may break existing data consumers in prod, trying decimal128...")
681
+
682
+ # Cast the price column from decimal256 to decimal128
683
+ decimal_table = decimal_table.set_column(
684
+ decimal_table.schema.get_field_index("price"),
685
+ "price",
686
+ pc.cast(decimal_table["price"], pa.decimal128(10, 2))
687
+ )
688
+
689
+ # Write the validated decimal data to staging and ensure that the write succeeds
690
+ dc.write(decimal_table, "financial_data", catalog="staging")
691
+ print(f"\n=== Successfully Staged Data ===")
692
+ print(dc.read("financial_data", catalog="staging", read_as=dc.DatasetType.PANDAS))
693
+
694
+ # Read from staging to verify
695
+ staging_data = dc.read("financial_data", catalog="staging", read_as=dc.DatasetType.PANDAS)
696
+ assert staging_data["price"].tolist() == [Decimal("999.99"), Decimal("1234.56"), Decimal("567.89")]
697
+
698
+ # Now write the validated data to production
699
+ dc.write(decimal_table, "financial_data", catalog="prod")
700
+ print(f"\n=== Production Data ===")
701
+ print(dc.read("financial_data", catalog="prod", read_as=dc.DatasetType.PANDAS))
702
+ ```
703
+
704
+ </details>
705
+
706
+ <details>
707
+
708
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Transaction History & Time Travel</span></summary>
709
+
710
+ DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with multi-table transactions, this enables consistent point-in-time views across your entire data catalog.
711
+
712
+ ```python
713
+ import deltacat as dc
714
+ import pandas as pd
715
+ import tempfile
716
+ import time
717
+
718
+ # Initialize DeltaCAT with a fresh temporary catalog
719
+ dc.init_local(tempfile.mkdtemp())
720
+
721
+ # Create initial state with existing users, products, and orders
722
+ initial_users = pd.DataFrame({
723
+ "user_id": [1, 2, 3],
724
+ "name": ["Cheshire", "Dinah", "Felix"],
725
+ })
726
+
727
+ initial_products = pd.DataFrame({
728
+ "product_id": [101, 102, 103],
729
+ "name": ["Mushrooms", "Fish", "Milk"],
730
+ "price": [12.99, 8.99, 3.99]
731
+ })
732
+
733
+ initial_orders = pd.DataFrame({
734
+ "order_id": [1001, 1002, 1003],
735
+ "user_id": [1, 2, 3],
736
+ "product_id": [101, 102, 103],
737
+ "quantity": [2, 1, 2]
738
+ })
739
+
740
+ initial_finance = pd.DataFrame({
741
+ "user_id": [1, 2, 3],
742
+ "lifetime_payments": [25.98, 8.99, 7.98],
743
+ })
744
+
745
+ # Write initial state atomically with a commit message
746
+ with dc.transaction(commit_message="Initial data load: users, products, orders, and finance"):
747
+ dc.write(initial_users, "users", namespace="identity")
748
+ dc.write(initial_products, "catalog", namespace="inventory")
749
+ dc.write(initial_orders, "transactions", namespace="sales")
750
+ dc.write(initial_finance, "users", namespace="finance")
751
+
752
+ # Sleep briefly to ensure transaction timestamp separation
753
+ time.sleep(0.1)
754
+
755
+ # Later, add new orders for existing and new users
756
+ new_orders = pd.DataFrame({
757
+ "order_id": [1004, 1005, 1006, 1007, 1008],
758
+ "user_id": [1, 2, 1, 4, 5],
759
+ "product_id": [101, 102, 101, 104, 105],
760
+ "quantity": [1, 2, 3, 5, 1]
761
+ })
762
+
763
+ new_users = pd.DataFrame({
764
+ "user_id": [4, 5],
765
+ "name": ["Tom", "Simpkin"],
766
+ })
767
+
768
+ new_products = pd.DataFrame({
769
+ "product_id": [104, 105],
770
+ "name": ["Tuna", "Salmon"],
771
+ "price": [6.99, 9.99]
772
+ })
773
+
774
+ # Update finance data with new lifetime payment totals
775
+ updated_finance = pd.DataFrame({
776
+ "user_id": [1, 2, 3, 4, 5],
777
+ "lifetime_payments": [51.96, 26.97, 15.96, 34.95, 9.99] # Updated totals
778
+ })
779
+
780
+ # Execute all updates atomically - either all succeed or all fail
781
+ with dc.transaction(commit_message="Add new users and products, update finance totals"):
782
+ # Add new users, products, orders, and lifetime payment totals
783
+ dc.write(new_users, "users", namespace="identity")
784
+ dc.write(new_products, "catalog", namespace="inventory")
785
+ dc.write(new_orders, "transactions", namespace="sales")
786
+ dc.write(updated_finance, "users", namespace="finance", mode=dc.TableWriteMode.REPLACE)
787
+
788
+ # Query transaction history to find the right timestamp for time travel
789
+ print("=== Transaction History ===")
790
+ txn_history = dc.transactions(read_as=dc.DatasetType.PANDAS)
791
+ print(f"Found {len(txn_history)} transactions:")
792
+ print(txn_history[["transaction_id", "commit_message", "start_time", "end_time"]])
793
+
794
+ # Find the transaction we want to time travel back to
795
+ initial_load_txn = txn_history[
796
+ txn_history["commit_message"] == "Initial data load: users, products, orders, and finance"
797
+ ]
798
+ checkpoint_time = initial_load_txn["end_time"].iloc[0] + 1
799
+
800
+ print(f"\nUsing checkpoint time from transaction: {initial_load_txn['transaction_id'].iloc[0]}")
801
+ print(f"Commit message: {initial_load_txn['commit_message'].iloc[0]}")
802
+
803
+ # Compare current state vs historic state across all tables
804
+ print("\n=== Current State (After Updates) ===")
805
+ current_users = dc.read("users", namespace="identity", read_as=dc.DatasetType.PANDAS)
806
+ current_orders = dc.read("transactions", namespace="sales", read_as=dc.DatasetType.PANDAS)
807
+ current_finance = dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS)
808
+
809
+ print("== Users ==")
810
+ print(current_users)
811
+ print("== Orders ==")
812
+ print(current_orders)
813
+ print("== Finance ==")
814
+ print(current_finance)
815
+
816
+ # Now query all tables as they existed at the checkpoint
817
+ print("\n=== Historic State (Before Updates) ===")
818
+ # DeltaCAT only reads transactions with end times strictly less than the given as-of time,
819
+ # so add 1 to the checkpoint time of the transaction we want to travel back to.
820
+ with dc.transaction(as_of=checkpoint_time + 1):
821
+ historic_users = dc.read("users", namespace="identity", read_as=dc.DatasetType.PANDAS)
822
+ historic_orders = dc.read("transactions", namespace="sales", read_as=dc.DatasetType.PANDAS)
823
+ historic_finance = dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS)
824
+
825
+ print("== Users ==")
826
+ print(historic_users)
827
+ print("== Orders ==")
828
+ print(historic_orders)
829
+ print("== Finance ==")
830
+ print(historic_finance)
831
+
832
+ # Validate historic state
833
+ assert not any(historic_users["name"] == "Tom")
834
+ assert not any(historic_users["name"] == "Simpkin")
835
+ assert len(historic_orders) == 3 # Only original 3 orders
836
+
837
+ # Finance data reflects original payment totals
838
+ historic_payments = historic_finance[historic_finance["user_id"] == 1]["lifetime_payments"].iloc[0]
839
+ assert historic_payments == 25.98 # Original total, not updated 51.96
840
+
841
+ print("\nTime travel validation successful!")
842
+ ```
843
+
844
+ </details>
845
+
846
+ <details>
847
+
848
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Multimodal Batch Inference</span></summary>
849
+
850
+ DeltaCAT's support for multimodal datasets and for merging new fields into existing records can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed predictions for each image:
851
+
852
+ > **Requirements**: This example requires PyTorch ≥ 2.8.0 and torchvision ≥ 0.23.0. Install via: `pip install "torch>=2.8.0" "torchvision>=0.23.0"`
853
+
854
+ ```python
855
+ import deltacat as dc
856
+ import tempfile
857
+ import daft
858
+ import torch
859
+ import pyarrow as pa
860
+
861
+ # Initialize DeltaCAT with a temporary catalog.
862
+ dc.init_local(tempfile.mkdtemp())
863
+
864
+ # Define initial schema with image_id as merge key (no prediction fields yet).
865
+ initial_schema = dc.Schema.of([
866
+ dc.Field.of(pa.field("image_id", pa.large_string()), is_merge_key=True),
867
+ dc.Field.of(pa.field("image_path", pa.large_string())),
868
+ dc.Field.of(pa.field("true_breed", pa.large_string())),
869
+ dc.Field.of(pa.field("image_bytes", pa.binary())),
870
+ ])
871
+
872
+ # Create sample Daft DataFrame with image URLs/paths.
873
+ df = daft.from_pydict({
874
+ "image_id": ["cat_001", "cat_002", "cat_003"],
875
+ "image_path": ["media/tuxedo.jpg", "media/calico.jpg", "media/siamese.jpg"],
876
+ "true_breed": ["Tuxedo", "Calico", "Siamese"]
877
+ })
878
+
879
+ # Load images and prepare for processing.
880
+ df = df.with_column("image_bytes", df["image_path"].url.download())
881
+
882
+ # Write initial dataset to DeltaCAT.
883
+ dc.write(df, "cool_cats", schema=initial_schema)
884
+
885
+ # Define an ImageClassifier UDF for cat breed prediction.
886
+ @daft.udf(return_dtype=daft.DataType.fixed_size_list(dtype=daft.DataType.string(), size=2))
887
+ class ImageClassifier:
888
+ def __init__(self):
889
+ """Initialize model once per worker for efficiency"""
890
+ self.model = torch.hub.load("NVIDIA/DeepLearningExamples:torchhub", "nvidia_resnet50", pretrained=True)
891
+ self.utils = torch.hub.load("NVIDIA/DeepLearningExamples:torchhub", "nvidia_convnets_processing_utils")
892
+ self.model.eval().to(torch.device("cpu"))
893
+
894
+ def __call__(self, image_paths):
895
+ """Process batch of images efficiently using NVIDIA utilities"""
896
+ batch = torch.cat([self.utils.prepare_input_from_uri(uri) for uri in image_paths.to_pylist()]).to(torch.device("cpu"))
897
+
898
+ with torch.no_grad():
899
+ output = torch.nn.functional.softmax(self.model(batch), dim=1)
900
+
901
+ results = self.utils.pick_n_best(predictions=output, n=1)
902
+ return [result[0] for result in results]
903
+
904
+ # Apply the UDF first to get predictions.
905
+ df_with_predictions = df.with_column("prediction", ImageClassifier(df["image_path"]))
906
+
907
+ # Run batch inference and prepare partial update data (merge key + new fields only)
908
+ # Adds predicted breed and confidence to existing records with matching image IDs.
909
+ prediction_data = df_with_predictions.select(
910
+ df_with_predictions["image_id"],
911
+ df_with_predictions["prediction"].list.get(0).alias("predicted_breed"),
912
+ (df_with_predictions["prediction"].list.get(1).str.replace("%", "").cast(daft.DataType.float64()) / 100.0).alias("confidence")
913
+ )
914
+
915
+ # Write the predictions back to the table.
916
+ dc.write(prediction_data, "cool_cats")
917
+
918
+ # Read back the merged results.
919
+ final_df = dc.read("cool_cats")
920
+ final_df = final_df.with_column("image", final_df["image_bytes"].image.decode())
921
+
922
+ # Calculate accuracy and display results.
923
+ results = final_df.select("image_id", "true_breed", "predicted_breed", "confidence").to_pandas()
924
+ accuracy = (results.apply(lambda row: row["true_breed"].lower() in row["predicted_breed"].lower(), axis=1)).mean()
925
+
926
+ print("=== Results ===")
927
+ print(f"Classification Accuracy: {accuracy:.1%}")
928
+
929
+ # Display final dataset with decoded images for visual inspection.
930
+ # Run this example in a Jupyter notebook to view the actual image data stored in DeltaCAT.
931
+ print(f"Final dataset with images and predictions:")
932
+ final_df.show()
933
+ ```
934
+
935
+ </details>
936
+
937
+ <details>
938
+
939
+ <summary><span style="font-size: 1.25em; font-weight: bold;">LLM Batch Inference</span></summary>
940
+
941
+ DeltaCAT multi-table transactions, time travel queries, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
942
+
943
+ ```python
944
+ import deltacat as dc
945
+ import pandas as pd
946
+ import pyarrow as pa
947
+ import tempfile
948
+ import time
949
+ import daft
950
+ from transformers import pipeline
951
+
952
+ # Initialize DeltaCAT with a temporary catalog.
953
+ dc.init_local(tempfile.mkdtemp())
954
+
955
+ # Load customer feedback
956
+ daft_docs = daft.from_pydict({
957
+ "doc_id": [1, 2, 3],
958
+ "path": ["media/customer_feedback_001.txt", "media/customer_feedback_002.txt", "media/customer_feedback_003.txt"]
959
+ })
960
+ daft_docs = daft_docs.with_column("content", daft_docs["path"].url.download().decode("utf-8"))
961
+
962
+ # Doc processing V1.0
963
+ # Capture basic feedback sentiment analysis in a parallel multi-table transaction
964
+ with dc.transaction():
965
+ # Write the full customer feedback to a new "documents" table.
966
+ dc.write(daft_docs, "documents", namespace="analysis")
967
+
968
+ # Define a UDF to analyze customer feedback sentiment.
969
+ @daft.udf(return_dtype=daft.DataType.struct({
970
+ "analysis": daft.DataType.string(),
971
+ "confidence": daft.DataType.float64(),
972
+ "model_version": daft.DataType.string()
973
+ }))
974
+ def analyze_sentiment(content_series):
975
+ classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
976
+ results = []
977
+ for content in content_series.to_pylist():
978
+ result = classifier(content[:500])[0] # Truncate for model limits
979
+ results.append({
980
+ "analysis": result['label'],
981
+ "confidence": result['score'],
982
+ "model_version": "roberta-v1.0"
983
+ })
984
+ return results
985
+
986
+ # Run sentiment analysis in parallel.
987
+ print("Running parallel customer feedback sentiment analysis...")
988
+ daft_results = daft_docs.with_column("analysis", analyze_sentiment(daft_docs["content"]))
989
+ daft_results = daft_results.select(
990
+ daft_docs["doc_id"],
991
+ daft_results["analysis"]["analysis"].alias("analysis"),
992
+ daft_results["analysis"]["confidence"].alias("confidence"),
993
+ daft_results["analysis"]["model_version"].alias("model_version")
994
+ )
995
+
996
+ # Write sentiment analysis to a new table with doc_id as the merge key.
997
+ initial_schema = dc.Schema.of([
998
+ dc.Field.of(pa.field("doc_id", pa.int64()), is_merge_key=True),
999
+ dc.Field.of(pa.field("analysis", pa.large_string())),
1000
+ dc.Field.of(pa.field("confidence", pa.float64())),
1001
+ dc.Field.of(pa.field("model_version", pa.large_string())),
1002
+ ])
1003
+ dc.write(daft_results, "insights", namespace="analysis", schema=initial_schema)
1004
+
1005
+ # Write to a new audit trail table.
1006
+ audit_df = pd.DataFrame([{
1007
+ "version": "v1.0",
1008
+ "docs_processed": dc.dataset_length(daft_docs),
1009
+ }])
1010
+ dc.write(audit_df, "audit", namespace="analysis")
1011
+
1012
+ print("=== V1.0: Customer feedback sentiment analysis processing complete! ===")
1013
+
1014
+ # Create checkpoint after v1.0 transaction commits.
1015
+ time.sleep(0.1)
1016
+ checkpoint_v1 = time.time_ns()
1017
+ time.sleep(0.1)
1018
+
1019
+ # Doc processing V2.0
1020
+ # Switch to a model that captures customer feedback emotion details.
1021
+ with dc.transaction():
1022
+ # Define a UDF to analyze customer feedback emotion details.
1023
+ @daft.udf(return_dtype=daft.DataType.struct({
1024
+ "analysis": daft.DataType.string(),
1025
+ "confidence": daft.DataType.float64(),
1026
+ "model_version": daft.DataType.string(),
1027
+ }))
1028
+ def analyze_emotions(content_series):
1029
+ classifier_v2 = pipeline("sentiment-analysis", model="j-hartmann/emotion-english-distilroberta-base")
1030
+ results = []
1031
+ for content in content_series.to_pylist():
1032
+ result = classifier_v2(content[:500])[0]
1033
+ results.append({
1034
+ "analysis": result['label'],
1035
+ "confidence": result['score'],
1036
+ "model_version": "distilroberta-v2.0",
1037
+ })
1038
+ return results
1039
+
1040
+ # Run emotion detail analysis in parallel.
1041
+ print("Running parallel customer feedback emotion detail analysis...")
1042
+ daft_emotions = daft_docs.with_column("analysis", analyze_emotions(daft_docs["content"]))
1043
+ daft_emotions = daft_emotions.select(
1044
+ daft_docs["doc_id"],
1045
+ daft_emotions["analysis"]["analysis"].alias("analysis"),
1046
+ daft_emotions["analysis"]["confidence"].alias("confidence"),
1047
+ daft_emotions["analysis"]["model_version"].alias("model_version"),
1048
+ )
1049
+
1050
+ # Merge new V2.0 insights into the existing V1.0 insights table.
1051
+ dc.write(daft_emotions, "insights", namespace="analysis")
1052
+ audit_df = pd.DataFrame([{"version": "v2.0", "docs_processed": dc.dataset_length(daft_docs)}])
1053
+ dc.write(audit_df, "audit", namespace="analysis")
1054
+
1055
+ print("=== V2.0: Customer feedback emotion analysis processing complete! ===")
1056
+
1057
+ time.sleep(0.1)
1058
+ checkpoint_v2 = time.time_ns()
1059
+ time.sleep(0.1)
1060
+
1061
+ # Doc processing V3.0
1062
+ # Generate customer service responses based on emotion analysis results.
1063
+ with dc.transaction():
1064
+ # First, read the current insights table with emotion analysis
1065
+ current_insights = dc.read("insights", namespace="analysis")
1066
+
1067
+ # Define a UDF to generate customer service responses based on analysis results.
1068
+ @daft.udf(return_dtype=daft.DataType.struct({
1069
+ "response_text": daft.DataType.string(),
1070
+ "response_model": daft.DataType.string(),
1071
+ "generated_at": daft.DataType.int64()
1072
+ }))
1073
+ def generate_responses_from_analysis(analysis_series):
1074
+ response_generator = pipeline("text-generation", model="microsoft/DialoGPT-medium")
1075
+ results = []
1076
+
1077
+ for analysis in analysis_series.to_pylist():
1078
+ # Create appropriate response prompt based on emotion analysis.
1079
+ if analysis in ["sadness"]:
1080
+ prompt = "Dear valued customer, we sincerely apologize for the inconvenience and"
1081
+ elif analysis in ["joy"]:
1082
+ prompt = "Thank you so much for your wonderful feedback! We're thrilled to hear"
1083
+ elif analysis in ["fear"]:
1084
+ prompt = "We understand your concerns and want to assure you that"
1085
+ else:
1086
+ prompt = "Thank you for contacting us. We appreciate your feedback and"
1087
+
1088
+ # Generate customer service responses.
1089
+ generated = response_generator(prompt, max_length=100, num_return_sequences=1, pad_token_id=50256)
1090
+ response_text = generated[0]['generated_text']
1091
+ results.append({
1092
+ "response_text": response_text,
1093
+ "response_model": "dialogpt-medium-v3.0",
1094
+ "generated_at": time.time_ns()
1095
+ })
1096
+ return results
1097
+
1098
+ # Run customer service response generation based on analysis results.
1099
+ print("Running parallel customer service response generation based on sentiment/emotion analysis...")
1100
+ daft_responses = current_insights.with_column(
1101
+ "response",
1102
+ generate_responses_from_analysis(current_insights["analysis"])
1103
+ )
1104
+ daft_responses = daft_responses.select(
1105
+ current_insights["doc_id"],
1106
+ daft_responses["response"]["response_text"].alias("response_text"),
1107
+ daft_responses["response"]["response_model"].alias("response_model"),
1108
+ daft_responses["response"]["generated_at"].alias("generated_at")
1109
+ )
1110
+ # Merge new V3.0 responses into the existing V2.0 insights table.
1111
+ # The new response columns are automatically joined by document ID.
1112
+ dc.write(daft_responses, "insights", namespace="analysis")
1113
+ audit_df = pd.DataFrame([{"version": "v3.0", "docs_processed": dc.dataset_length(current_insights)}])
1114
+ dc.write(audit_df, "audit", namespace="analysis")
1115
+
1116
+ print("=== V3.0: Customer service response generation processing complete! ===")
1117
+
1118
+ print("\n=== Time Travel Comparison of all Versions ===")
1119
+ with dc.transaction(as_of=checkpoint_v1):
1120
+ print(f"== V1.0 Insights (sentiment) ==")
1121
+ print(dc.read("insights", namespace="analysis").show())
1122
+ print(f"== V1.0 Audit ==")
1123
+ print(dc.read("audit", namespace="analysis").show())
1124
+
1125
+ with dc.transaction(as_of=checkpoint_v2):
1126
+ print(f"== V2.0 Insights (emotion) ==")
1127
+ print(dc.read("insights", namespace="analysis").show())
1128
+ print(f"== V2.0 Audit ==")
1129
+ print(dc.read("audit", namespace="analysis").show())
1130
+
1131
+ v3_results = dc.read("insights", namespace="analysis")
1132
+ print(f"== V3.0 Insights (customer service response) ==")
1133
+ print(dc.read("insights", namespace="analysis").show())
1134
+ print(f"== V3.0 Audit ==")
1135
+ print(dc.read("audit", namespace="analysis").show())
1136
+ ```
1137
+
1138
+ </details>
1139
+
1140
+ ## Runtime Environment Requirements
1141
+
1142
+ DeltaCAT's transaction system assumes that the host machine provides strong system clock accuracy guarantees, and that the filesystem hosting the catalog root directory offers strong consistency.
1143
+
1144
+ Taken together, these requirements make DeltaCAT suitable for production use on most major cloud computing hosts (e.g., EC2, GCE, Azure VMs) and storage systems (e.g., S3, GCS, Azure Blob Storage), while local laptops are typically best limited to testing and experimentation.
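+
+ For example, a production catalog might be configured much like the local multi-catalog example above, but rooted in object storage instead of a temporary directory. The following is a minimal sketch, assuming an existing S3 bucket (`my-deltacat-bucket` and the region are placeholders) and that the catalog root may be given as a bucket/prefix path resolved by the supplied PyArrow-compatible filesystem; it is illustrative rather than a definitive configuration:
+
+ ```python
+ import deltacat as dc
+ from pyarrow import fs
+
+ # Hypothetical example: root a "prod" catalog in S3.
+ # Bucket name, prefix, and region below are placeholders.
+ dc.init(catalogs={
+     "prod": dc.Catalog(config=dc.CatalogProperties(
+         root="my-deltacat-bucket/prod-catalog",  # bucket/prefix resolved by the filesystem below
+         filesystem=fs.S3FileSystem(region="us-east-1")
+     ))
+ })
+
+ # Reads and writes then target the S3-rooted catalog by name, e.g.:
+ # dc.write(df, "users", catalog="prod")
+ ```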
1145
+
1146
+ ## Additional Resources
1147
+ ### Table Documentation
1148
+
1149
+ The [Table](deltacat/docs/table/README.md) documentation provides a more comprehensive overview of DeltaCAT's table management APIs, including how to create, read, write, and manage tables.
1150
+
1151
+ ### Schema Documentation
1152
+
1153
+ The [Schema](deltacat/docs/schema/README.md) documentation provides a more comprehensive overview of DeltaCAT's schema management APIs, supported data types, file formats, and data consistency guarantees.
1154
+
1155
+ ### DeltaCAT URLs and Filesystem APIs
1156
+ The [DeltaCAT API Tests](deltacat/tests/test_deltacat_api.py) provide examples of how to efficiently explore, clone, and manipulate DeltaCAT catalogs by using DeltaCAT URLs together with filesystem-like list/copy/get/put APIs.
1157
+
1158
+ ### DeltaCAT Catalog APIs
1159
+ The [Default Catalog Tests](deltacat/tests/catalog/test_default_catalog_impl.py) provide more exhaustive examples of DeltaCAT **Catalog** API behavior.
1160
+
1161
+ ### Examples
1162
+
1163
+ The [DeltaCAT Examples](deltacat/examples/) show how to build more advanced applications like external data source indexers and custom dataset compactors. They also demonstrate some experimental features like Apache Iceberg and Apache Beam integrations.