deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py

@@ -0,0 +1,673 @@

```python
import json
import sys
import pickle
import base64
from typing import Dict, List, Any
from pathlib import Path

from deltacat.utils.pyarrow import get_base_arrow_type_name


def load_test_data(json_file: str) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Load test results and metadata from JSON file."""
    with open(json_file, "r") as f:
        data = json.load(f)

    if isinstance(data, dict):
        if "test_results" in data and "metadata" in data:
            # New format with metadata
            return data["test_results"], data["metadata"]
        else:
            raise ValueError(f"Unexpected JSON structure in {json_file}")
    elif isinstance(data, list):
        # Old format - just a list of results
        return data, {}
    else:
        raise ValueError(f"Unexpected JSON structure in {json_file}")


def load_test_results(json_file: str) -> List[Dict[str, Any]]:
    """Load test results from JSON file (backward compatibility)."""
    results, _ = load_test_data(json_file)
    return results


def extract_physical_type_mapping_from_json(
    result: Dict[str, Any], content_type_key: str
) -> str:
    """Extract physical type mapping from JSON result."""
    if not result.get("success", False):
        return None

    # Even if PyArrow read failed, we can still extract physical schema if files were written
    # The physical schema inspection happens at the file level, not via PyArrow read

    physical_schema = result.get("physical_schema", {})

    if physical_schema.get("error"):
        return None

    if content_type_key == "parquet":
        columns = physical_schema.get("columns", {})
        if columns:
            first_col = next(iter(columns.values()))
            physical_type = first_col.get("parquet_physical_type", "unknown")
            logical_type = first_col.get("parquet_logical_type")
            if logical_type and logical_type != "None":
                return f"{physical_type} ({logical_type})"
            return physical_type

    elif content_type_key == "feather":
        columns = physical_schema.get("columns", {})
        if columns:
            first_col = next(iter(columns.values()))
            return first_col.get("feather_preserved_type", "unknown")

    elif content_type_key == "avro":
        columns = physical_schema.get("columns", {})
        if columns:
            first_col = next(iter(columns.values()))
            avro_type = first_col.get("avro_type")
            if avro_type:
                return str(avro_type)
            return "unknown"

    elif content_type_key == "orc":
        columns = physical_schema.get("columns", {})
        if columns:
            first_col = next(iter(columns.values()))
            return first_col.get("orc_type_kind", "unknown")

    return None


def generate_type_table_markdown(
    arrow_type: str, arrow_description: str, results: List[Dict[str, Any]]
) -> str:
    """Generate a single type table in markdown format."""

    # Filter results for this arrow type
    type_results = [r for r in results if r["arrow_type"] == arrow_type]

    if not type_results:
        return (
            f"\n#### **{arrow_description}** \nNo test results found for this type.\n"
        )

    # Organize results by dataset type and content type
    dataset_types = ["pyarrow", "pandas", "polars", "daft", "ray_dataset"]
    content_types = [
        "application/parquet",
        "application/feather",
        "application/avro",
        "application/orc",
    ]
    content_type_keys = ["parquet", "feather", "avro", "orc"]

    # Build result matrix and physical mappings per dataset type
    result_matrix = {}
    dataset_physical_mappings = {}

    for dataset_type in dataset_types:
        result_matrix[dataset_type] = {}
        dataset_physical_mappings[dataset_type] = {}

        for content_type in content_types:
            # Find the specific result
            specific_result = next(
                (
                    r
                    for r in type_results
                    if r["dataset_type"] == dataset_type
                    and r["content_type"] == content_type
                ),
                None,
            )

            if specific_result:
                write_success = specific_result["success"]

                if write_success:
                    result_matrix[dataset_type][content_type] = "✅"
                else:
                    result_matrix[dataset_type][content_type] = "❌"  # Write failed

                # Extract physical type mapping for this dataset type
                content_key = content_type.replace("application/", "")
                physical_type = extract_physical_type_mapping_from_json(
                    specific_result, content_key
                )
                if physical_type and physical_type != "unknown":
                    dataset_physical_mappings[dataset_type][content_key] = physical_type
            else:
                result_matrix[dataset_type][content_type] = "❓"

    # Generate markdown table
    markdown = f"\n#### **{arrow_description}**\n"
    markdown += "| Dataset Type | Parquet | Feather | Avro | ORC | Physical Types |\n"
    markdown += "|--------------|---------|---------|------|-----|---------------|\n"

    for dataset_type in dataset_types:
        row_results = result_matrix.get(dataset_type, {})
        parquet_result = row_results.get("application/parquet", "❓")
        feather_result = row_results.get("application/feather", "❓")
        avro_result = row_results.get("application/avro", "❓")
        orc_result = row_results.get("application/orc", "❓")

        # Build physical types string for this dataset type
        dataset_mappings = dataset_physical_mappings.get(dataset_type, {})
        physical_parts = []

        for content_key in content_type_keys:
            if content_key in dataset_mappings:
                physical_parts.append(
                    f"{content_key.title()}:`{dataset_mappings[content_key]}`"
                )

        physical_col = "; ".join(physical_parts) if physical_parts else ""

        markdown += f"| `{dataset_type}` | {parquet_result} | {feather_result} | {avro_result} | {orc_result} | {physical_col} |\n"

    return markdown


def generate_read_compatibility_matrix_markdown(
    results: List[Dict[str, Any]], arrow_type_descriptions: Dict[str, str]
) -> str:
    """Generate read compatibility matrix markdown from test results."""

    # Collect all read compatibility data
    read_compat_data = (
        {}
    )  # arrow_type -> writer_dataset -> content_type -> {reader_dataset: success}

    for result in results:
        arrow_type = result["arrow_type"]
        arrow_type_description = arrow_type_descriptions.get(arrow_type, arrow_type)
        writer_dataset = result["dataset_type"]
        content_type = result["content_type"]
        write_success = result.get("success", False)
        dataset_read_results = result.get("dataset_read_results", [])

        if arrow_type_description not in read_compat_data:
            read_compat_data[arrow_type_description] = {}
        if writer_dataset not in read_compat_data[arrow_type_description]:
            read_compat_data[arrow_type_description][writer_dataset] = {}
        if content_type not in read_compat_data[arrow_type_description][writer_dataset]:
            read_compat_data[arrow_type_description][writer_dataset][content_type] = {}

        if write_success:
            # Only process read results if the write was successful
            # Add PyArrow read result based on actual read success
            # If pyarrow_read_success field is missing, we can't assume it succeeded
            pyarrow_read_success = result.get("pyarrow_read_success")
            if pyarrow_read_success is not None:
                read_compat_data[arrow_type_description][writer_dataset][content_type][
                    "pyarrow"
                ] = pyarrow_read_success

            # Add other dataset type read results
            for read_result in dataset_read_results:
                reader_dataset = read_result["dataset_type"]
                success = read_result["success"]
                read_compat_data[arrow_type_description][writer_dataset][content_type][
                    reader_dataset
                ] = success
        else:
            # Write failed - mark all readers as incompatible (represented by "—")
            # This ensures the writer appears in the table but shows no compatibility data
            pass

    if not read_compat_data:
        return (
            "\n## Read Compatibility Tables\n\nNo read compatibility data available.\n"
        )

    # Generate markdown
    markdown = """\n## Read Compatibility Tables\n\n
The following tables show read compatibility for each Arrow type across available writer/reader combinations.\n

This information is automatically used by DeltaCAT at write time to ensure that data written in one format can be
read by all supported reader types defined in a table's `SUPPORTED_READER_TYPES` table property. If data to be
written cannot be read by one or more supported reader types, then a `TableValidationError` will be raised.
"""

    # Get all dataset types that appear as readers
    all_readers = set()
    for arrow_data in read_compat_data.values():
        for writer_data in arrow_data.values():
            for content_data in writer_data.values():
                all_readers.update(content_data.keys())
    all_readers = sorted(list(all_readers))

    # Generate table for each arrow type
    for arrow_type in sorted(read_compat_data.keys()):
        markdown += f"\n### {arrow_type}\n\n"

        # Organize by content type
        content_types = set()
        for writer_data in read_compat_data[arrow_type].values():
            content_types.update(writer_data.keys())
        content_types = sorted(list(content_types))

        for content_type in content_types:
            markdown += f"\n#### {content_type}\n\n"

            # Find all writers for this content type
            writers = []
            for writer_dataset in sorted(read_compat_data[arrow_type].keys()):
                if content_type in read_compat_data[arrow_type][writer_dataset]:
                    writers.append(writer_dataset)

            if not writers:
                continue

            # Create table header
            markdown += "| Writer \\ Reader | " + " | ".join(all_readers) + " |\n"
            markdown += "|" + "---|" * (len(all_readers) + 1) + "\n"

            # Create table rows
            for writer in writers:
                row = [f"**{writer}**"]
                reader_data = read_compat_data[arrow_type][writer][content_type]

                for reader in all_readers:
                    if reader in reader_data:
                        result = reader_data[reader]
                        row.append("✅" if result else "❌")
                    else:
                        row.append("—")

                markdown += "| " + " | ".join(row) + " |\n"

            markdown += "\n"

    return markdown


def _normalize_complex_types(serialized_arrow_type: str) -> str:
    """Normalize complex arrow types to their base type names without parameters.

    This function uses the serialized PyArrow type for reliable normalization.

    Args:
        serialized_arrow_type: Base64-encoded pickled PyArrow type (required)

    Returns:
        Normalized type name using the common utility function

    Raises:
        ValueError: If serialized_arrow_type is None or deserialization fails
    """
    if not serialized_arrow_type:
        raise ValueError(
            "serialized_arrow_type is required for reliable type normalization"
        )

    # Deserialize the PyArrow type from base64-encoded pickle
    serialized_bytes = base64.b64decode(serialized_arrow_type)
    pa_type = pickle.loads(serialized_bytes)

    # Use the common utility function for normalization
    return get_base_arrow_type_name(pa_type)


def generate_reader_compatibility_mapping(
    results: List[Dict[str, Any]],
    output_file: str = "./reader_compatibility_mapping.py",
) -> str:
    """Generate reader compatibility mapping Python file from test results."""

    # Collect compatibility data: (arrow_type, writer_dataset) -> list of compatible readers
    compatibility_mapping = {}

    for result in results:
        if not result.get("success", False):
            continue

        # Get serialized arrow type (required for normalization)
        serialized_arrow_type = result.get("serialized_arrow_type")

        # Normalize complex types to base type names using serialized type
        arrow_type = _normalize_complex_types(serialized_arrow_type)
        writer_dataset = result["dataset_type"]
        content_type = result["content_type"]

        # Create key tuple
        key = (arrow_type, writer_dataset, content_type)

        compatible_readers = []

        # Check PyArrow read success
        pyarrow_read_success = result.get("pyarrow_read_success")
        if pyarrow_read_success:
            compatible_readers.append("PYARROW")

        # Check other dataset type read results
        dataset_read_results = result.get("dataset_read_results", [])
        for read_result in dataset_read_results:
            reader_dataset = read_result["dataset_type"]
            success = read_result["success"]
            if success:
                # Map to DatasetType enum values
                dataset_type_mapping = {
                    "pyarrow": "PYARROW",
                    "pandas": "PANDAS",
                    "polars": "POLARS",
                    "daft": "DAFT",
                    "ray_dataset": "RAY_DATASET",
                }
                enum_value = dataset_type_mapping.get(reader_dataset)
                if enum_value and enum_value not in compatible_readers:
                    compatible_readers.append(enum_value)

        if compatible_readers:
            # Merge with existing compatibility for same key (union of compatible readers)
            if key in compatibility_mapping:
                existing_readers = set(compatibility_mapping[key])
                new_readers = set(compatible_readers)
                compatibility_mapping[key] = list(existing_readers.union(new_readers))
            else:
                compatibility_mapping[key] = compatible_readers

    # Generate Python file content
    python_content = '''"""
Reader compatibility mapping generated from test results.

This mapping shows which DatasetType readers can successfully read data
written by each (arrow_type, writer_dataset_type, content_type) combination.

Keys: (arrow_type, writer_dataset_type, content_type)
Values: List of compatible DatasetType enum values
"""

from deltacat.types.tables import DatasetType

# Mapping of (arrow_type, writer_dataset_type, content_type) -> list of compatible readers
READER_COMPATIBILITY_MAPPING = {
'''

    # Sort keys for consistent output
    for key in sorted(compatibility_mapping.keys()):
        compatible_readers = compatibility_mapping[key]
        arrow_type, writer_dataset, content_type = key

        # Format as Python tuple and list
        readers_str = (
            "["
            + ", ".join(
                [f"DatasetType.{reader}" for reader in sorted(compatible_readers)]
            )
            + "]"
        )
        python_content += f'    ("{arrow_type}", "{writer_dataset}", "{content_type}"): {readers_str},\n'

    python_content += '''}

def get_compatible_readers(arrow_type: str, writer_dataset_type: str, content_type: str):
    """Get list of compatible reader DatasetTypes for given combination."""
    key = (arrow_type, writer_dataset_type, content_type)
    compatible_readers = READER_COMPATIBILITY_MAPPING.get(key, [])
    if (
        DatasetType.PANDAS in compatible_readers
        and DatasetType.NUMPY not in compatible_readers
    ):
        compatible_readers = compatible_readers + [DatasetType.NUMPY]
    return compatible_readers

def is_reader_compatible(arrow_type: str, writer_dataset_type: str, content_type: str, reader_dataset_type: DatasetType) -> bool:
    """Check if a specific reader is compatible with given combination."""
    compatible_readers = get_compatible_readers(arrow_type, writer_dataset_type, content_type)
    return reader_dataset_type in compatible_readers
'''

    # Write to file
    with open(output_file, "w") as f:
        f.write(python_content)

    print(f"✅ Generated reader compatibility mapping: {output_file}")
    return output_file


def generate_complete_markdown_from_json(
    json_file: str, output_file: str = "./docs/schema/README.md"
):
    """Generate complete markdown from JSON results."""

    print(f"Loading results from {json_file}...")
    results, metadata = load_test_data(json_file)
    print(f"Loaded {len(results)} test results")

    if metadata:
        print(f"Found metadata with test date: {metadata.get('test_date', 'unknown')}")
        print(f"PyArrow version: {metadata.get('pyarrow_version', 'unknown')}")
    else:
        raise ValueError(f"No metadata found in {json_file}")

    # Get unique arrow types from results
    arrow_types_in_results = sorted(list(set(r["arrow_type"] for r in results)))
    print(
        f"Found {len(arrow_types_in_results)} unique arrow types: {arrow_types_in_results}"
    )
    # map arrow type names to their descriptions using each results original_arrow_type field
    arrow_type_descriptions = {}
    for arrow_type in arrow_types_in_results:
        # extract original_arrow_type field from each result
        original_arrow_type = next(
            (
                r["original_arrow_type"]
                for r in results
                if r["arrow_type"] == arrow_type
            ),
            None,
        )
        if original_arrow_type:
            arrow_type_descriptions[arrow_type] = original_arrow_type

    # Generate dynamic metadata section
    test_date = metadata.get("test_date", "unknown")
    if "T" in test_date:
        # Convert ISO format to date only
        test_date = test_date.split("T")[0]

    pyarrow_version = metadata.get("pyarrow_version", "unknown")

    markdown = f"""# Schemas

DeltaCAT tables may either be schemaless or backed by a schema based on the [Arrow type system](https://arrow.apache.org/docs/python/api/datatypes.html).

## Schemaless Tables
A schemaless table is created via `dc.create_table(new_table_name)` (schema omitted) or
`dc.write_to_table(data, new_table_name, schema=None)` (schema explicitly set to `None` when writing
to a new table). Schemaless tables only save a record of files written to them over time without schema
inference, data validation, or data coercion. Since it may not be possible to derive a unified schema on
read, data returned via `manifest_table = dc.read_table(table_name)` is always a **Manifest Table**
containing an ordered list of files written to the table and their manifest entry info (e.g., size,
content type, content encoding, etc.). For example:

| Column | Value | Type | Description |
|----------------------------|---------------------------|----------|------------------------------------------------------|
| author_name | "deltacat.write_to_table" | str | Manifest producer name |
| author_version | "2.0.0b12" | str | Manifest producer version |
| id | None | str | Manifest entry ID (can be None) |
| mandatory | True | bool | Raise error if file is missing (True/False) |
| meta_content_encoding | "identity" | str | File content encoding (identity = no encoding) |
| meta_content_length | 2413 | int64 | File size in bytes (2.4 KB) |
| meta_content_type | "application/parquet" | str | File format (Parquet) |
| meta_record_count | 2 | int64 | Number of records in this file |
| meta_source_content_length | 176 | int64 | Original data size in memory (176 bytes) |
| previous_stream_position | 1 | int64 | Previous delta stream position |
| stream_position | 2 | int64 | This delta's stream position |
| path | /my_catalog/data/file.pq | str | File path relative to catalog root |

If you know that all paths can be read into a standard DeltaCAT dataset type (e.g., Daft, Ray Data, PyArrow,
Pandas, Polars), then this manifest table can be materialized via
`dataframe = dc.from_manifest_table(manifest_table)`.

Once created, schemaless tables cannot be altered to have a schema.

## Standard Tables
Tables with schemas have their data validation and schema evolution behavior governed by **Schema
Consistency Types** and **Schema Evolution Modes**. This ensures that the table can always be materialized
with a unified schema at read time. By default, any DeltaCAT table created via
`dc.write_to_table(data, new_table_name)` infers a unified Arrow schema on write, and rejects writes
that would break reads for one or more supported dataset types. Once created, a standard table's
schema cannot be dropped.

## Schema Consistency Types
DeltaCAT table schemas can either be **inferred** (default behavior) to follow the shape of written data
or **enforced** to define the shape of written data. The default schema consistency type of all fields
in a DeltaCAT table schema is configured by setting the `DEFAULT_SCHEMA_CONSISTENCY_TYPE` table property
to one of the following values:

\n\n**NONE** (default): No data consistency checks are run. The schema field's type will be automatically
promoted to the most permissive Arrow data type that all values can be safely cast to using
`pyarrow.unify_schemas(schemas, promote_options="permissive")`. If safe casting is impossible,
then a `SchemaValidationError` will be raised.

\n\n**COERCE**: Coerce fields to fit the schema whenever possible, even if data truncation is required. Fields
will be coerced using either `pyarrow.compute.cast` or `daft.expression.cast` with default options. If the
field cannot be coerced to fit the given type, then a `SchemaValidationError` will be raised.

\n\n**VALIDATE**: Strict data consistency checks. An error is raised for any field that doesn't match the schema.

A field's Schema Consistency Type can only be updated from least to most permissive (VALIDATE -> COERCE -> NONE).

## Schema Evolution Modes
Schema evolution modes control how schema changes are handled when writing to a table.
A table's schema evolution mode is configured by setting the `SCHEMA_EVOLUTION_MODE`
table property to one of the following values:

\n\n**AUTO** (default): New fields are automatically added to the table schema at write time with their
Schema Consistency Type set by the `DEFAULT_SCHEMA_CONSISTENCY_TYPE` table property.

\n\n**MANUAL**: Existing schema fields with a Schema Consistency Type of `None` will continue to be automatically
updated to match the written data. New fields and other schema changes must be made explicitly via
`dc.alter_table(table_name, schema_updates=new_schema_updates)`. Attempts to write data with fields not in the
existing schema will raise a `SchemaValidationError`.

\n\n**DISABLED**: Existing schema fields with a Schema Consistency Type of `None` will continue to be automatically
updated to match the written data. All other schema changes are disabled, and manual attempts to alter the table's
schema will raise a `TableValidationError`.

A table's Schema Evolution Mode can be updated at any time.

## Arrow to File Format Type Mappings
The tables below show DeltaCAT's actual Arrow write type mappings across all supported dataset and content types.
These mappings are generated by:

1. Creating a PyArrow table with the target PyArrow data type via `pa.Table.from_arrays([pa.array(test_data, type=arrow_type)])`.
2. Casting to the target dataset type via `data = dc.from_pyarrow(pyarrow_table, target_dataset_type)`.
3. Writing to the target content type via `dc.write_to_table(data, table_name, content_type=target_content_type)`.

More details are available in the [type mapping generation script](../../deltacat/docs/autogen/schema/inference/generate_type_mappings.py).

### Runtime Environment
**Generation Date:** {test_date}
\n**PyArrow Version:** {pyarrow_version}"""

    # Add other version information if available
    if metadata.get("deltacat_version"):
        markdown += f"\n\n**DeltaCAT Version:** {metadata['deltacat_version']}"
    if metadata.get("pandas_version"):
        markdown += f"\n\n**Pandas Version:** {metadata['pandas_version']}"
    if metadata.get("polars_version"):
        markdown += f"\n\n**Polars Version:** {metadata['polars_version']}"
    if metadata.get("daft_version") and metadata["daft_version"] != "not_available":
        markdown += f"\n\n**Daft Version:** {metadata['daft_version']}"
    if metadata.get("ray_version") and metadata["ray_version"] != "not_available":
        markdown += f"\n\n**Ray Version:** {metadata['ray_version']}"

    markdown += f"""

### Type Mapping Tables
"""

    # Generate tables for each arrow type
    for arrow_type in arrow_types_in_results:
        description = arrow_type_descriptions.get(arrow_type, arrow_type)
        type_table = generate_type_table_markdown(arrow_type, description, results)
        markdown += type_table
        print(f"Generated table for {arrow_type}")

    # Generate read compatibility matrix
    print("Generating read compatibility matrix...")
    read_compat_markdown = generate_read_compatibility_matrix_markdown(
        results, arrow_type_descriptions
    )
    markdown += read_compat_markdown
    print("Generated read compatibility matrix")

    # Write to file
    with open(output_file, "w") as f:
        f.write(markdown)

    print(f"✅ Generated markdown: {output_file}")

    # Analyze the results to identify the physical schema extraction issues
    print("\n" + "=" * 80)
    print("ANALYSIS: Physical Schema Extraction Issues")
    print("=" * 80)

    successful_extractions = 0
    failed_extractions = 0
    no_physical_data = 0

    for result in results:
        if result.get("success", False):
            physical_schema = result.get("physical_schema", {})
            if physical_schema.get("error"):
                failed_extractions += 1
                if "no written files found" in physical_schema.get("error", "").lower():
                    no_physical_data += 1
            elif physical_schema.get("columns"):
                successful_extractions += 1
            else:
                no_physical_data += 1

    print(f"Successful physical schema extractions: {successful_extractions}")
    print(f"Failed extractions: {failed_extractions}")
    print(f"No physical data: {no_physical_data}")
    print(
        f"Total successful tests: {len([r for r in results if r.get('success', False)])}"
    )

    return output_file


def main():
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print(
            "Usage: python parse_json_type_mappings.py <json_results_file> [--python]"
        )
        sys.exit(1)

    json_file = sys.argv[1]
    generate_python = len(sys.argv) == 3 and sys.argv[2] == "--python"

    if generate_python:
        # Generate reader compatibility mapping
        print(f"Loading results from {json_file} for compatibility mapping...")
        results, _ = load_test_data(json_file)
        print(f"Loaded {len(results)} test results")

        # Navigate to project root for output
        project_root = Path(__file__)
        while project_root.name != "deltacat":
            project_root = project_root.parent
        output_file_path = project_root / "utils" / "reader_compatibility_mapping.py"
        print(f"Writing reader compatibility mapping to {output_file_path}")
        generate_reader_compatibility_mapping(results, str(output_file_path))
    else:
        # Generate markdown documentation
        # keep navigating to parent directories until we find the docs directory
        docs_dir = Path(__file__)
        while docs_dir.name != "docs":
            docs_dir = docs_dir.parent
        output_file_path = docs_dir / "schema" / "README.md"
        print(f"Writing to {output_file_path}")
        generate_complete_markdown_from_json(json_file, output_file_path)


if __name__ == "__main__":
    main()
```
deltacat/examples/compactor/aws/__init__.py

@@ -0,0 +1 @@

```python
# DeltaCAT Compactor AWS Examples
```
|