deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/docs/autogen/schema/inference/generate_type_mappings.py
@@ -0,0 +1,687 @@
+import os
+import json
+import pickle
+import tempfile
+import uuid
+import base64
+import shutil
+from datetime import datetime
+from polars.exceptions import PanicException
+from typing import List, Dict, Any, Tuple
+
+import deltacat as dc
+from deltacat import Catalog
+from deltacat.catalog import CatalogProperties
+from deltacat.types.media import ContentType, DatasetType
+from deltacat.types.tables import (
+    from_pyarrow,
+    TableWriteMode,
+    get_dataset_type,
+    get_table_length,
+    get_table_column_names,
+    get_table_schema,
+)
+from deltacat.storage import Metafile, Delta
+from deltacat.utils.pyarrow import get_supported_test_types
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pyarrow.orc as orc
+import pyarrow.feather as feather
+
+
+def get_version_info():
+    """Capture version information for all libraries."""
+    version_info = {
+        "test_date": datetime.now().isoformat(),
+        "pyarrow_version": pa.__version__,
+    }
+
+    # Get DeltaCAT version
+    try:
+        version_info["deltacat_version"] = dc.__version__
+    except AttributeError:
+        # Fallback if __version__ not available
+        try:
+            import pkg_resources
+
+            version_info["deltacat_version"] = pkg_resources.get_distribution(
+                "deltacat"
+            ).version
+        except Exception:
+            version_info["deltacat_version"] = "unknown"
+
+    # Get Pandas version
+    try:
+        import pandas as pd
+
+        version_info["pandas_version"] = pd.__version__
+    except ImportError:
+        version_info["pandas_version"] = "not_available"
+
+    # Get Polars version
+    try:
+        import polars as pl
+
+        version_info["polars_version"] = pl.__version__
+    except ImportError:
+        version_info["polars_version"] = "not_available"
+
+    # Get Daft version
+    try:
+        import daft
+
+        version_info["daft_version"] = daft.__version__
+    except (ImportError, AttributeError):
+        version_info["daft_version"] = "not_available"
+
+    # Get Ray version
+    try:
+        import ray
+
+        version_info["ray_version"] = ray.__version__
+    except (ImportError, AttributeError):
+        version_info["ray_version"] = "not_available"
+
+    return version_info
+
+
+def get_comprehensive_test_types() -> List[Tuple[str, str, List[Any]]]:
+    """Get comprehensive Arrow types for testing."""
+    return get_supported_test_types()
+
+
+def extract_file_paths_from_deltas(all_objects: List[Any]) -> List[str]:
+    """Extract file paths from Delta objects by parsing manifest entries."""
+    file_paths = []
+
+    for obj in all_objects:
+        obj_type = Metafile.get_class(obj)
+
+        if obj_type == Delta:
+            delta_obj = obj
+            # Access manifest entries to get file paths
+            if hasattr(delta_obj, "manifest") and delta_obj.manifest:
+                manifest = delta_obj.manifest
+                if hasattr(manifest, "entries") and manifest.entries:
+                    for entry in manifest.entries:
+                        file_url = entry.uri or entry.url
+
+                        # Convert file:// URLs to local paths
+                        if file_url.startswith("file://"):
+                            file_path = file_url[7:]
+                        else:
+                            file_path = file_url
+
+                        file_paths.append(file_path)
+
+    return file_paths
+
+
+def inspect_specific_file_physical_schema(
+    file_path: str, content_type: ContentType
+) -> Dict[str, Any]:
+    """Inspect the physical schema of a specific file."""
+
+    try:
+        if not os.path.exists(file_path):
+            return {"error": f"File not found: {file_path}"}
+
+        if content_type == ContentType.PARQUET:
+            parquet_file = pq.ParquetFile(file_path)
+            arrow_schema = parquet_file.schema_arrow
+            parquet_schema = parquet_file.schema
+            parquet_schema_string = str(parquet_schema)
+            column_info = {}
+            parquet_col_index = 0
+
+            for i in range(len(arrow_schema)):
+                arrow_field = arrow_schema.field(i)
+                arrow_type_str = str(arrow_field.type)
+
+                # For collection types, we need to handle them specially
+                col = parquet_schema.column(parquet_col_index)
+                if col.max_definition_level > 1 or col.max_repetition_level > 1:
+                    parquet_physical_type_name_suffix = "Unknown"
+                    if col.max_repetition_level > 0 and "list" in parquet_schema_string:
+                        parquet_physical_type_name_suffix = "List"
+                    elif col.max_definition_level > 0:
+                        if "map" in parquet_schema_string:
+                            parquet_physical_type_name_suffix = "Map"
+                        else:
+                            parquet_physical_type_name_suffix = "Struct"
+                    parquet_physical_type_name_prefix = (
+                        f"{col.max_definition_level}-Level"
+                        if col.max_definition_level > 0
+                        else ""
+                    )
+                    parquet_physical_type_name = f"{parquet_physical_type_name_prefix} {parquet_physical_type_name_suffix}"
+                    parquet_logical_type_name = (
+                        "LIST"
+                        if "(List)" in parquet_schema_string
+                        else "MAP"
+                        if "(Map)" in parquet_schema_string
+                        else ""
+                    )
+                    # For collection types, use the Arrow type as the "physical" representation
+                    # since Parquet's physical schema doesn't directly represent these structures
+                    print(f"Logical Type: {parquet_logical_type_name}")
+                    print(f"Physical Type: {parquet_physical_type_name}")
+                    print(f"Path: {col.path}")
+                    print(f"Max Definition Level: {col.max_definition_level}")
+                    print(f"Max Repetition Level: {col.max_repetition_level}")
+                    column_info[f"column_{i}"] = {
+                        "arrow_type": arrow_type_str,
+                        "parquet_physical_type": parquet_physical_type_name,
+                        "parquet_logical_type": parquet_logical_type_name,
+                        "parquet_converted_type": "unknown",
+                        "nullable": arrow_field.nullable,
+                    }
+                    # Skip the nested columns that are part of this complex type
+                    if "list<" in arrow_type_str.lower():
+                        parquet_col_index += 1  # Lists have nested structure
+                    elif "struct<" in arrow_type_str.lower():
+                        # Count the number of fields in the struct
+                        struct_fields = (
+                            arrow_type_str.count(",") + 1
+                            if "," in arrow_type_str
+                            else 1
+                        )
+                        parquet_col_index += struct_fields
+                    elif "dictionary<" in arrow_type_str.lower():
+                        parquet_col_index += 1  # Dictionary has values storage
+                else:
+                    # For simple types, use the actual Parquet column info
+                    try:
+                        col = parquet_schema.column(parquet_col_index)
+                        column_info[f"column_{i}"] = {
+                            "arrow_type": arrow_type_str,
+                            "parquet_physical_type": str(col.physical_type),
+                            "parquet_logical_type": str(col.logical_type)
+                            if col.logical_type
+                            else None,
+                            "parquet_converted_type": str(col.converted_type)
+                            if col.converted_type
+                            else None,
+                            "nullable": arrow_field.nullable,
+                        }
+                        parquet_col_index += 1
+                    except (IndexError, Exception):
+                        # Fallback if we can't match to parquet column
+                        column_info[f"column_{i}"] = {
+                            "arrow_type": arrow_type_str,
+                            "parquet_physical_type": "UNKNOWN",
+                            "parquet_logical_type": None,
+                            "parquet_converted_type": None,
+                            "nullable": arrow_field.nullable,
+                        }
+
+            return {
+                "format": "parquet",
+                "columns": column_info,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+        elif content_type == ContentType.FEATHER:
+            feather_table = feather.read_table(file_path)
+
+            column_info = {}
+            for i, field in enumerate(feather_table.schema):
+                column_info[f"column_{i}"] = {
+                    "arrow_type": str(field.type),
+                    "feather_preserved_type": str(field.type),
+                    "nullable": field.nullable,
+                }
+
+            return {
+                "format": "feather",
+                "columns": column_info,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+        elif content_type == ContentType.AVRO:
+            # For Avro, use fastavro to read the schema
+            import fastavro
+
+            with open(file_path, "rb") as f:
+                reader = fastavro.reader(f)
+                avro_schema = reader.writer_schema
+
+            column_info = {}
+            if "fields" in avro_schema:
+                for i, field in enumerate(avro_schema["fields"]):
+                    field_type = field["type"]
+                    # Handle union types (used for nullable fields)
+                    if isinstance(field_type, list):
+                        # Find the non-null type in union
+                        non_null_types = [t for t in field_type if t != "null"]
+                        if non_null_types:
+                            field_type = non_null_types[0]
+                        nullable = "null" in field_type
+                    else:
+                        nullable = False
+
+                    column_info[f"column_{i}"] = {
+                        "field_name": field["name"],
+                        "avro_type": str(field_type),
+                        "nullable": nullable,
+                        "original_field": field,
+                    }
+
+            return {
+                "format": "avro",
+                "columns": column_info,
+                "avro_schema": avro_schema,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+        elif content_type == ContentType.ORC:
+            orc_file = orc.ORCFile(file_path)
+
+            column_info = {}
+            for i, field in enumerate(orc_file.schema):
+                column_info[f"column_{i}"] = {
+                    "arrow_type": str(field.type),
+                    "orc_type_kind": str(field.type).split("(")[0]
+                    if "(" in str(field.type)
+                    else str(field.type),
+                    "nullable": field.nullable,
+                }
+
+            return {
+                "format": "orc",
+                "columns": column_info,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+    except (PanicException, Exception) as e:
+        return {
+            "error": f"Physical inspection failed: {str(e)}",
+            "error_type": type(e).__name__,
+            "file_path": file_path,
+        }
+
+
+def test_dataset_read_compatibility(
+    table_name: str,
+    namespace: str,
+    catalog_name: str,
+    dataset_types: List[DatasetType],
+) -> List[Dict[str, Any]]:
+    """Test reading the table with different dataset types."""
+    read_results = []
+
+    for read_dataset_type in dataset_types:
+        print(f"    Testing read with {read_dataset_type.value}")
+        try:
+            read_result = dc.read_table(
+                table=table_name,
+                namespace=namespace,
+                catalog=catalog_name,
+                read_as=read_dataset_type,
+                max_parallelism=1,
+            )
+
+            # Verify the actual dataset type matches what we expected
+            actual_dataset_type = get_dataset_type(read_result)
+
+            # Extract basic information about the read result
+            result_info = {
+                "dataset_type": read_dataset_type.value,
+                "actual_dataset_type": actual_dataset_type.value,
+                "success": True,
+                "error": None,
+                "result_type": type(read_result).__name__,
+            }
+
+            # Use proper utility functions based on expected dataset type
+            try:
+                result_info["num_rows"] = get_table_length(read_result)
+            except Exception as e:
+                result_info["num_rows"] = f"Error getting length: {str(e)}"
+
+            try:
+                column_names = get_table_column_names(read_result)
+                result_info["num_columns"] = len(column_names)
+                result_info["column_names"] = column_names
+            except Exception as e:
+                result_info["num_columns"] = f"Error getting columns: {str(e)}"
+
+            # Get schema information using the utility function
+            try:
+                schema = get_table_schema(read_result)
+                result_info["schema"] = str(schema)
+                if schema.metadata is not None:
+                    result_info["has_metadata"] = True
+            except Exception as e:
+                result_info["schema"] = f"Schema error: {str(e)}"
+
+            read_results.append(result_info)
+            print(f"    ✅ Read successful")
+
+        except (PanicException, Exception) as e:
+            read_results.append(
+                {
+                    "dataset_type": read_dataset_type.value,
+                    "success": False,
+                    "error": str(e),
+                    "error_type": type(e).__name__,
+                    "result_type": None,
+                    "schema": None,
+                    "num_columns": 0,
+                    "num_rows": 0,
+                }
+            )
+            print(f"    ❌ Read failed: {str(e)[:100]}...")
+
+    return read_results
+
+
+def run_single_test(
+    arrow_type_name: str,
+    arrow_type_code: str,
+    test_data: List[Any],
+    dataset_type: DatasetType,
+    content_type: ContentType,
+    catalog_name: str,
+) -> Dict[str, Any]:
+    """Run a single test with proper file-to-test mapping using dc.list."""
+
+    try:
+        # Create Arrow table
+        arrow_type = eval(arrow_type_code)
+        arrow_table = pa.Table.from_arrays(
+            [pa.array(test_data, type=arrow_type)], names=[arrow_type_name]
+        )
+
+        # Convert to dataset type
+        write_dataset = from_pyarrow(arrow_table, dataset_type)
+
+        # Create unique table name with timestamp to avoid conflicts
+        timestamp = datetime.now().strftime("%H%M%S%f")
+        table_name = f"test_{arrow_type_name}_{dataset_type.value}_{content_type.value.replace('/', '_')}_{timestamp}"
+        namespace = "test_namespace"
+
+        print(f"  Writing to table: {table_name}")
+
+        # Write to DeltaCAT with reader compatibility validation disabled
+        dc.write_to_table(
+            data=write_dataset,
+            table=table_name,
+            namespace=namespace,
+            catalog=catalog_name,
+            mode=TableWriteMode.CREATE,
+            content_type=content_type,
+            table_properties={
+                "supported_reader_types": None  # Disable reader compatibility validation
+            },
+        )
+
+        # Try to read back with PyArrow for type verification
+        pyarrow_read_success = True
+        read_result = None
+        pyarrow_read_error = None
+
+        try:
+            read_result = dc.read_table(
+                table=table_name,
+                namespace=namespace,
+                catalog=catalog_name,
+                read_as=DatasetType.PYARROW,
+                max_parallelism=1,
+            )
+            print(f"  ✅ PyArrow read-back successful")
+        except Exception as e:
+            pyarrow_read_success = False
+            pyarrow_read_error = str(e)
+            print(f"  ⚠️ PyArrow read-back failed: {str(e)[:100]}...")
+
+        # Test read compatibility with different dataset types
+        print(f"  Testing read compatibility with other dataset types...")
+        additional_dataset_types = [
+            DatasetType.PANDAS,
+            DatasetType.POLARS,
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+        ]
+
+        dataset_read_results = test_dataset_read_compatibility(
+            table_name, namespace, catalog_name, additional_dataset_types
+        )
+
+        # Use dc.list with recursive=True to find the objects for this specific table
+        table_url = dc.DeltaCatUrl(f"dc://{catalog_name}/{namespace}/{table_name}")
+        print(f"  Listing objects for: {table_url}")
+
+        try:
+            table_objects = dc.list(table_url, recursive=True)
+            print(f"  Found {len(table_objects)} objects for table")
+
+            # Extract file paths from Delta objects
+            file_paths = extract_file_paths_from_deltas(table_objects)
+            print(f"  Extracted {len(file_paths)} file paths")
+
+            if file_paths:
+                # Use the first file path (should be the one we just wrote)
+                file_path = file_paths[0]
+                print(f"  Inspecting file: {file_path}")
+
+                # Inspect the physical schema of this specific file
+                physical_schema = inspect_specific_file_physical_schema(
+                    file_path, content_type
+                )
+            else:
+                physical_schema = {"error": "No file paths found in Delta objects"}
+
+        except Exception as e:
+            physical_schema = {"error": f"Failed to list table objects: {str(e)}"}
+
+        # Serialize the PyArrow type for reliable deserialization later
+        serialized_arrow_type = base64.b64encode(pickle.dumps(arrow_type)).decode(
+            "utf-8"
+        )
+
+        return {
+            "arrow_type": arrow_type_name,
+            "dataset_type": dataset_type.value,
+            "content_type": content_type.value,
+            "success": True,  # Write was successful
+            "pyarrow_read_success": pyarrow_read_success,
+            "pyarrow_read_error": pyarrow_read_error,
+            "original_arrow_type": str(arrow_type),
+            "serialized_arrow_type": serialized_arrow_type,
+            "read_back_type": str(read_result.schema.field(0).type)
+            if read_result and hasattr(read_result, "schema")
+            else "unknown",
+            "physical_schema": physical_schema,
+            "type_preserved": str(arrow_type) == str(read_result.schema.field(0).type)
+            if read_result and hasattr(read_result, "schema")
+            else False,
+            "error": None,
+            "table_name": table_name,
+            "dataset_read_results": dataset_read_results,
+        }
+
+    except (PanicException, Exception) as e:
+        print(f"  Test failed with error: {str(e)}")
+
+        # Try to serialize the arrow_type even on failure (if arrow_type was created)
+        try:
+            arrow_type = eval(arrow_type_code)
+            original_arrow_type = str(arrow_type)
+            serialized_arrow_type = base64.b64encode(pickle.dumps(arrow_type)).decode(
+                "utf-8"
+            )
+        except Exception:
+            # If we can't create the arrow_type, we can't serialize it
+            original_arrow_type = "unknown"
+            serialized_arrow_type = None
+
+        return {
+            "arrow_type": arrow_type_name,
+            "dataset_type": dataset_type.value,
+            "content_type": content_type.value,
+            "success": False,  # Write failed
+            "pyarrow_read_success": False,
+            "pyarrow_read_error": None,  # Write failed, not read
+            "original_arrow_type": original_arrow_type,
+            "serialized_arrow_type": serialized_arrow_type,
+            "read_back_type": "unknown",
+            "physical_schema": {},
+            "type_preserved": False,
+            "error": str(e),
+            "error_category": "unknown",
+            "table_name": f"failed_{arrow_type_name}_{dataset_type.value}",
+            "dataset_read_results": [],
+        }
+
+
+def run_type_mapping_tests(catalog_name: str) -> List[Dict[str, Any]]:
+    """Run the actual type mapping tests and return results."""
+    arrow_types = get_comprehensive_test_types()
+    dataset_types = [
+        DatasetType.PYARROW,
+        DatasetType.PANDAS,
+        DatasetType.POLARS,
+        DatasetType.DAFT,
+        DatasetType.RAY_DATASET,
+    ]  # All dataset types
+    content_types = [
+        ContentType.PARQUET,
+        ContentType.FEATHER,
+        ContentType.AVRO,
+        ContentType.ORC,
+    ]  # Test 4 content types
+
+    print(
+        f"Testing {len(arrow_types)} Arrow types × {len(dataset_types)} dataset types × {len(content_types)} content types"
+    )
+    print()
+
+    all_results = []
+    test_count = 0
+    total_tests = len(arrow_types) * len(dataset_types) * len(content_types)
+
+    for arrow_type_name, arrow_type_code, test_data in arrow_types:
+        print(f"Testing PyArrow type: {arrow_type_name}")
+
+        for dataset_type in dataset_types:
+            for content_type in content_types:
+                test_count += 1
+                print(
+                    f"  [{test_count:2d}/{total_tests}] {dataset_type.value} → {content_type.value}"
+                )
+
+                result = run_single_test(
+                    arrow_type_name,
+                    arrow_type_code,
+                    test_data,
+                    dataset_type,
+                    content_type,
+                    catalog_name,
+                )
+
+                if result["success"]:
+                    # Write was successful, check read status
+                    read_status = (
+                        "✅" if result.get("pyarrow_read_success", True) else "⚠️"
+                    )
+
+                    if result["physical_schema"].get("error"):
+                        print(
+                            f"    {read_status} Write ✅, Physical schema error: {result['physical_schema']['error']}"
+                        )
+                    else:
+                        # Show extracted physical type
+                        columns = result["physical_schema"].get("columns", {})
+                        if columns:
+                            first_col = next(iter(columns.values()))
+                            if content_type == ContentType.PARQUET:
+                                physical_type = first_col.get(
+                                    "parquet_physical_type", "unknown"
+                                )
+                                print(
+                                    f"    {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                            elif content_type == ContentType.FEATHER:
+                                physical_type = first_col.get(
+                                    "feather_preserved_type", "unknown"
+                                )
+                                print(
+                                    f"    {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                            elif content_type == ContentType.AVRO:
+                                physical_type = first_col.get("avro_type", "unknown")
+                                print(
+                                    f"    {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                            elif content_type == ContentType.ORC:
+                                physical_type = first_col.get(
+                                    "orc_type_kind", "unknown"
+                                )
+                                print(
+                                    f"    {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                        else:
+                            print(f"    {read_status} Write ✅, No column info found")
+
+                    # Show read error if any
+                    if not result.get("pyarrow_read_success", True):
+                        read_error = result.get("pyarrow_read_error", "unknown")
+                        print(f"    PyArrow read failed: {read_error[:100]}...")
+                else:
+                    print(f"    ❌ Write failed: {result.get('error', 'unknown')}")
+
+                all_results.append(result)
+        print()
+
+    return all_results
+
+
+def main():
+    print("=" * 80)
+    print("PHYSICAL SCHEMA EXTRACTION TEST")
+    print("=" * 80)
+    print("Using dc.list with table-specific URLs to map files to tests")
+
+    # Setup
+    temp_dir = tempfile.mkdtemp()
+    catalog_name = f"test-catalog-{uuid.uuid4()}"
+    catalog_props = CatalogProperties(root=temp_dir)
+    dc.put_catalog(catalog_name, catalog=Catalog(config=catalog_props))
+
+    print(f"Using catalog directory: {temp_dir}")
+
+    try:
+        # Run the tests
+        all_results = run_type_mapping_tests(catalog_name)
+
+        # Save detailed results with version information
+        version_info = get_version_info()
+        output_data = {"metadata": version_info, "test_results": all_results}
+
+        output_file_name = "generate_type_mappings_results.json"
+        with open(output_file_name, "w") as f:
+            json.dump(output_data, f, indent=2, default=str)
+
+        print(f"Detailed results saved to: {output_file_name}")
+        print(f"Catalog directory: {temp_dir}")
+
+    finally:
+        # Clean up test catalog and temporary directory
+        try:
+            dc.clear_catalogs()  # Clear catalog from memory
+            shutil.rmtree(temp_dir)  # Remove temporary directory and all contents
+            print(f"✅ Cleaned up test catalog directory: {temp_dir}")
+        except Exception as cleanup_error:
+            print(
+                f"⚠️ Warning: Failed to clean up catalog directory {temp_dir}: {cleanup_error}"
+            )
+            print("NOTE: You may need to manually delete this directory")
+
+
+if __name__ == "__main__":
+    main()
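
For reference, a minimal sketch of how the generate_type_mappings_results.json file written by main() above could be summarized after a local run. The file name and result keys ("metadata", "test_results", "success", "pyarrow_read_success", "deltacat_version") come from the script itself; the summary helper is illustrative only and is not part of the package:

import json

# Load the results file written by main() above (assumes a prior local run).
with open("generate_type_mappings_results.json") as f:
    output = json.load(f)

results = output["test_results"]
clean = sum(1 for r in results if r["success"] and r.get("pyarrow_read_success"))
print(
    f"deltacat {output['metadata'].get('deltacat_version')}: "
    f"{clean}/{len(results)} type/format combinations wrote and read back cleanly"
)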