deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/tests/test_exceptions.py

@@ -11,15 +11,15 @@ from deltacat.exceptions import (
     UnclassifiedDeltaCatError,
 )
 from daft.exceptions import DaftTransientError
-from deltacat.tests.
+from deltacat.tests.utils.exceptions import (
     InvalidNamespaceError,
-
+    MainStorageValidationError,
 )
+from deltacat.tests.utils import main_deltacat_storage_mock as ds
 from botocore.exceptions import NoCredentialsError
 from tenacity import retry, retry_if_exception_type, stop_after_attempt

 from pyarrow.lib import ArrowCapacityError
-import deltacat.tests.local_deltacat_storage as ds


 class MockUnknownException(Exception):
@@ -41,7 +41,7 @@ def mock_remote_task(exception_to_raise):
     mock_raise_exception(exception_to_raise)


-class TestCategorizeErrors(unittest.TestCase):
+class TestCategorizeErrorsMain(unittest.TestCase):
     def test_pyarrow_exception_categorizer(self):
         self.assertRaises(
             DependencyPyarrowCapacityError,
@@ -50,7 +50,7 @@ class TestCategorizeErrors(unittest.TestCase):

     def test_storage_exception_categorizer(self):
         self.assertRaises(
-
+            MainStorageValidationError,
             lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
         )

@@ -98,3 +98,7 @@ class TestCategorizeErrors(unittest.TestCase):
             return

         self.assertFalse(True)
+
+
+if __name__ == "__main__":
+    unittest.main()
deltacat/tests/test_utils/pyarrow.py

@@ -1,8 +1,9 @@
 from typing import List, Optional, Union
 import pyarrow as pa
 from deltacat.storage import Delta, Partition, PartitionLocator, DeltaLocator
-
+from deltacat.storage import metastore
 from deltacat.types.media import StorageType, ContentType
+from deltacat.storage.model.schema import Schema


 def create_delta_from_csv_file(
@@ -14,58 +15,89 @@ def create_delta_from_csv_file(
     *args,
     **kwargs,
 ) -> Delta:
+    assert file_paths is not None, "file_paths cannot be empty"
+    pa_table = create_table_from_csv_file_paths(file_paths)
+    schema = Schema.of(pa_table.schema)
     staged_partition = stage_partition_from_file_paths(
         namespace,
         file_paths,
+        schema,
         *args,
         table_name=table_name,
         table_version=table_version,
         **kwargs,
     )
     committed_delta = commit_delta_to_staged_partition(
-        staged_partition,
+        staged_partition,
+        pa_table,
+        content_type,
+        *args,
+        **kwargs,
     )
     return committed_delta


+def create_table_from_csv_file_paths(
+    file_paths: List[str],
+) -> pa.Table:
+    tables = []
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
+    return pa.concat_tables(tables)
+
+
 def stage_partition_from_file_paths(
     namespace: str,
     file_paths: List[str],
+    schema: Schema,
     table_name: Optional[str] = None,
     table_version: int = 1,
     *args,
     **kwargs,
 ) -> Partition:
-
+    if not metastore.namespace_exists(namespace, **kwargs):
+        metastore.create_namespace(namespace, **kwargs)
     if table_name is None:
         table_name = "-".join(file_paths).replace("/", "_")
-
-
-
+    metastore.create_table_version(
+        namespace,
+        table_name,
+        str(table_version),
+        schema=schema,
+        **kwargs,
+    )
+    stream = metastore.get_stream(
+        namespace,
+        table_name,
+        str(table_version),
+        **kwargs,
+    )
+    staged_partition = metastore.stage_partition(stream, **kwargs)
     return staged_partition


 def commit_delta_to_staged_partition(
     staged_partition,
-
+    pa_table: pa.Table,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
     committed_delta = commit_delta_to_partition(
         staged_partition,
+        pa_table,
+        content_type,
         *args,
-        file_paths=file_paths,
-        content_type=content_type,
         **kwargs,
     )
-
+    metastore.commit_partition(staged_partition, **kwargs)
     return committed_delta


 def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> Delta:
     return pa.concat_tables(
-
+        metastore.download_delta(
             delta_like,
             storage_type=StorageType.LOCAL,
             *args,
@@ -76,23 +108,22 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> Delta:

 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []

     if isinstance(partition, PartitionLocator):
-        partition =
+        partition = metastore.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )

-
-
-
-
-
-
+    staged_delta = metastore.stage_delta(
+        pa_table,
+        partition,
+        content_type=content_type,
+        **kwargs,
+    )

-    return
+    return metastore.commit_delta(staged_delta, **kwargs)
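The rewritten test helpers above route all staging, commit, and download calls through deltacat.storage.metastore in place of the removed local_deltacat_storage module. A minimal usage sketch follows (a hypothetical call site; the leading parameters of create_delta_from_csv_file are truncated in this hunk, so the positional namespace/file_paths signature below is an assumption):

    from deltacat.tests.test_utils.pyarrow import (
        create_delta_from_csv_file,
        download_delta,
    )
    from deltacat.types.media import ContentType

    # Assumed signature: namespace and file_paths lead the parameter list, and
    # catalog-selection kwargs are forwarded to every metastore call via **kwargs.
    delta = create_delta_from_csv_file(
        "test_namespace",
        ["/tmp/part_1.csv", "/tmp/part_2.csv"],
        content_type=ContentType.PARQUET,
    )

    # download_delta concatenates the delta's manifest entries into one pyarrow.Table.
    table = download_delta(delta)
    print(table.num_rows)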
deltacat/tests/test_utils/storage.py

@@ -25,12 +25,16 @@ from deltacat.storage import (
     NullOrder,
     Partition,
     PartitionKey,
+    PartitionKeyList,
     PartitionLocator,
     PartitionScheme,
+    PartitionSchemeList,
     Schema,
     SchemaList,
     SortScheme,
+    SortSchemeList,
     SortKey,
+    SortKeyList,
     SortOrder,
     StreamLocator,
     StreamFormat,
@@ -59,7 +63,10 @@ def create_empty_delta(
     manifest_entry_id: Optional[str] = None,
 ) -> Delta:
     stream_position = current_time_ms()
-    delta_locator = DeltaLocator.of(
+    delta_locator = DeltaLocator.of(
+        partition.locator,
+        stream_position=stream_position,
+    )

     if manifest_entry_id:
         manifest = Manifest.of(
@@ -131,12 +138,12 @@ def create_test_table_version():
         PartitionKey.of(
             key=["some_string", "some_int32"],
             name="test_partition_key",
-            field_id=
+            field_id=1,
             transform=bucket_transform,
         )
     ]
     partition_scheme = PartitionScheme.of(
-        keys=partition_keys,
+        keys=PartitionKeyList.of(partition_keys),
         name="test_partition_scheme",
         scheme_id="test_partition_scheme_id",
     )
@@ -151,7 +158,7 @@ def create_test_table_version():
         )
     ]
     sort_scheme = SortScheme.of(
-        keys=sort_keys,
+        keys=SortKeyList.of(sort_keys),
         name="test_sort_scheme",
         scheme_id="test_sort_scheme_id",
     )
@@ -166,8 +173,8 @@ def create_test_table_version():
         watermark=None,
         lifecycle_state=LifecycleState.CREATED,
         schemas=SchemaList.of([schema]),
-        partition_schemes=[partition_scheme],
-        sort_schemes=[sort_scheme],
+        partition_schemes=PartitionSchemeList.of([partition_scheme]),
+        sort_schemes=SortSchemeList.of([sort_scheme]),
     )


@@ -189,12 +196,12 @@ def create_test_stream():
         PartitionKey.of(
             key=["some_string", "some_int32"],
             name="test_partition_key",
-            field_id=
+            field_id=1,
             transform=bucket_transform,
         )
     ]
     partition_scheme = PartitionScheme.of(
-        keys=partition_keys,
+        keys=PartitionKeyList.of(partition_keys),
         name="test_partition_scheme",
         scheme_id="test_partition_scheme_id",
     )
@@ -217,28 +224,8 @@ def create_test_partition():
         partition_values=["a", 1],
         partition_id="test_partition_id",
     )
-    schema = Schema.of(
-        [
-            Field.of(
-                field=pa.field("some_string", pa.string(), nullable=False),
-                field_id=1,
-                is_merge_key=True,
-            ),
-            Field.of(
-                field=pa.field("some_int32", pa.int32(), nullable=False),
-                field_id=2,
-                is_merge_key=True,
-            ),
-            Field.of(
-                field=pa.field("some_float64", pa.float64()),
-                field_id=3,
-                is_merge_key=False,
-            ),
-        ]
-    )
     return Partition.of(
         locator=partition_locator,
-        schema=schema,
         content_types=[ContentType.PARQUET],
         state=CommitState.STAGED,
         previous_stream_position=0,
@@ -274,12 +261,14 @@ def create_test_delta():
         entry_params=manifest_entry_params,
     )
     manifest = Manifest.of(
-        entries=
-
-
-
-
-
+        entries=ManifestEntryList(
+            [
+                ManifestEntry.of(
+                    url="s3://test/url",
+                    meta=manifest_meta,
+                )
+            ]
+        ),
         author=ManifestAuthor.of(
             name="deltacat",
             version="2.0",
deltacat/tests/types/__init__.py

File without changes
deltacat/tests/types/test_tables.py

@@ -0,0 +1,104 @@
+import pytest
+import pandas as pd
+import pyarrow as pa
+
+from deltacat.types.tables import (
+    to_pandas,
+    to_pyarrow,
+    get_table_length,
+)
+
+
+def test_convert_to_pandas_error_cases():
+    """Test convert_to_pandas with invalid inputs."""
+    # Test None input
+    with pytest.raises(
+        ValueError, match="No pandas conversion function found for table type"
+    ):
+        to_pandas(None)
+
+    # Test unsupported type
+    with pytest.raises(
+        ValueError, match="No pandas conversion function found for table type"
+    ):
+        to_pandas("invalid_string")
+
+    # Test unsupported type with complex object
+    with pytest.raises(
+        ValueError, match="No pandas conversion function found for table type"
+    ):
+        to_pandas({"not": "a_dataframe"})
+
+
+def test_convert_to_arrow_error_cases():
+    """Test convert_to_arrow with invalid inputs."""
+    # Test None input
+    with pytest.raises(
+        ValueError, match="No pyarrow conversion function found for table type"
+    ):
+        to_pyarrow(None)
+
+    # Test unsupported type
+    with pytest.raises(
+        ValueError, match="No pyarrow conversion function found for table type"
+    ):
+        to_pyarrow("invalid_string")
+
+    # Test unsupported type with complex object
+    with pytest.raises(
+        ValueError, match="No pyarrow conversion function found for table type"
+    ):
+        to_pyarrow({"not": "a_table"})
+
+
+def test_conversion_functions_with_real_data():
+    """Test conversion functions with actual data structures."""
+    # Create test data
+    test_df = pd.DataFrame({"id": [1, 2], "name": ["test1", "test2"]})
+    test_table = pa.Table.from_pandas(test_df)
+
+    # Test pandas conversion
+    converted_df = to_pandas(test_df)
+    assert isinstance(converted_df, pd.DataFrame)
+    assert converted_df.equals(test_df)
+
+    # Test arrow conversion
+    converted_table = to_pyarrow(test_table)
+    assert isinstance(converted_table, pa.Table)
+    assert converted_table.equals(test_table)
+
+    # Test cross-conversion
+    df_from_table = to_pandas(test_table)
+    table_from_df = to_pyarrow(test_df)
+    assert isinstance(df_from_table, pd.DataFrame)
+    assert isinstance(table_from_df, pa.Table)
+
+
+def test_conversion_roundtrip_consistency():
+    """Test that conversion functions maintain data integrity through roundtrips."""
+    # Create test data
+    original_df = pd.DataFrame(
+        {
+            "id": [1, 2, 3, 4, 5],
+            "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
+            "age": [25, 30, 35, 40, 45],
+            "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
+        }
+    )
+
+    # Test pandas -> arrow -> pandas roundtrip
+    arrow_table = to_pyarrow(original_df)
+    roundtrip_df = to_pandas(arrow_table)
+
+    # Verify data integrity (allowing for potential type changes)
+    assert get_table_length(original_df) == get_table_length(
+        roundtrip_df
+    ), "Row count should be preserved"
+    assert list(original_df.columns) == list(
+        roundtrip_df.columns
+    ), "Column names should be preserved"
+
+    # Verify ID column integrity (critical for merge operations)
+    original_ids = sorted(original_df["id"].tolist())
+    roundtrip_ids = sorted(roundtrip_df["id"].tolist())
+    assert original_ids == roundtrip_ids, "ID column should be preserved exactly"
deltacat/tests/utils/exceptions.py

@@ -0,0 +1,22 @@
+"""
+Exception classes for main storage testing that mirror the local storage exceptions.
+These are used to test the main metastore error categorization functionality.
+"""
+
+
+class InvalidNamespaceError(Exception):
+    """Exception raised when an invalid namespace is provided to main storage."""
+
+    error_name = "InvalidNamespaceError"
+
+
+class MainStorageValidationError(Exception):
+    """Exception raised when main storage validation fails."""
+
+    error_name = "MainStorageValidationError"
+
+
+class MainStorageError(Exception):
+    """General exception for main storage operations."""
+
+    error_name = "MainStorageError"
deltacat/tests/utils/main_deltacat_storage_mock.py

@@ -0,0 +1,31 @@
+"""
+Mock module that provides storage-specific error categorization functions for main storage testing.
+"""
+
+from deltacat.tests.utils.exceptions import (
+    InvalidNamespaceError,
+    MainStorageValidationError,
+)
+
+
+def can_categorize(e: BaseException, **kwargs) -> bool:
+    """
+    Mock implementation of can_categorize for main storage testing.
+    Returns True if the input error can be categorized by main storage.
+    """
+    if isinstance(e, InvalidNamespaceError):
+        return True
+    else:
+        return False
+
+
+def raise_categorized_error(e: BaseException, **kwargs):
+    """
+    Mock implementation of raise_categorized_error for main storage testing.
+    Converts categorizable errors to their main storage equivalent.
+    """
+    if isinstance(e, InvalidNamespaceError):
+        raise MainStorageValidationError("Namespace provided is invalid!")
+    else:
+        # If we can't categorize it, re-raise the original exception
+        raise e
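The mock above supplies the can_categorize/raise_categorized_error hooks that the rewritten exception tests pass in as deltacat_storage. A minimal sketch of how a caller might drive it (a hypothetical harness, not part of the diff):

    from deltacat.tests.utils import main_deltacat_storage_mock as ds
    from deltacat.tests.utils.exceptions import (
        InvalidNamespaceError,
        MainStorageValidationError,
    )

    def handle(e: BaseException) -> None:
        # Categorizable errors are re-raised as their main-storage equivalents;
        # anything else propagates unchanged.
        if ds.can_categorize(e):
            ds.raise_categorized_error(e)
        raise e

    try:
        handle(InvalidNamespaceError("bad namespace"))
    except MainStorageValidationError as err:
        print(f"categorized: {err}")  # -> categorized: Namespace provided is invalid!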
deltacat/tests/utils/ray_utils/test_dataset.py

@@ -6,6 +6,8 @@ from fsspec import AbstractFileSystem
 from ray.data.datasource import FilenameProvider
 from deltacat.types.media import ContentType
 import ray
+import gzip
+import json


 class TestDatasetToFile:
@@ -20,7 +22,13 @@ class TestDatasetToFile:

     @pytest.fixture(scope="module")
     def mock_dataset(self):
-
+        # Include data that would need escaping to test quoting behavior
+        return from_items([{"col1": "a,b\tc|d", "col2": 0} for _ in range(5)])
+
+    @pytest.fixture(scope="module")
+    def mock_unescaped_dataset(self):
+        # Use data without delimiters for unescaped TSV test
+        return from_items([{"col1": "abc", "col2": 0} for _ in range(5)])

     @pytest.fixture(scope="module")
     def mock_filename_provider(self):
@@ -35,12 +43,12 @@ class TestDatasetToFile:
     def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
         from deltacat.utils.ray_utils.dataset import dataset_to_file

-        fs: AbstractFileSystem = fsspec.filesystem("
+        fs: AbstractFileSystem = fsspec.filesystem("file")

         dataset_to_file(
             mock_dataset,
             self.BASE_PATH,
-
+            filesystem=fs,
             block_path_provider=mock_filename_provider,
         )

@@ -51,16 +59,126 @@ class TestDatasetToFile:
     def test_csv_sanity(self, mock_dataset, mock_filename_provider):
         from deltacat.utils.ray_utils.dataset import dataset_to_file

-        fs: AbstractFileSystem = fsspec.filesystem("
+        fs: AbstractFileSystem = fsspec.filesystem("file")

         dataset_to_file(
             mock_dataset,
             self.BASE_PATH,
-
+            filesystem=fs,
             block_path_provider=mock_filename_provider,
             content_type=ContentType.CSV.value,
         )

         file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
         assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify CSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to commas in data
+                assert '"a,b\tc|d",0' in content
+
+        fs.delete(file_expected_at)
+
+    def test_tsv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.TSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify TSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to tabs in data
+                assert '"a,b\tc|d"\t0' in content
+
+        fs.delete(file_expected_at)
+
+    def test_psv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.PSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify PSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to pipes in data
+                assert '"a,b\tc|d"|0' in content
+
+        fs.delete(file_expected_at)
+
+    def test_unescaped_tsv_sanity(self, mock_unescaped_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_unescaped_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.UNESCAPED_TSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify UNESCAPED_TSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should NOT be quoted since data has no delimiters
+                assert "abc\t0" in content
+
+        fs.delete(file_expected_at)
+
+    def test_json_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.JSON.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify JSON format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Each line should be a valid JSON object
+                first_line = content.split("\n")[0]
+                record = json.loads(first_line)
+                assert record == {"col1": "a,b\tc|d", "col2": 0}
+
         fs.delete(file_expected_at)