deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
```diff
--- a/deltacat/tests/utils/test_daft.py
+++ b/deltacat/tests/utils/test_daft.py
@@ -1,6 +1,9 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.utils.daft import
+from deltacat.utils.daft import (
+    daft_file_to_pyarrow_table,
+    files_to_dataframe,
+)
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
 import pyarrow as pa
@@ -8,11 +11,11 @@ import pyarrow as pa
 from pyarrow import parquet as pq
 
 
-class TestDaftS3FileToTable(unittest.TestCase):
+class TestDaftFileToPyarrowTable(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
 
-    def
-        table =
+    def test_read_from_local_all_columns(self):
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -20,8 +23,8 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.names, ["a", "b"])
         self.assertEqual(table.num_rows, 100)
 
-    def
-        table =
+    def test_read_from_local_single_column_via_include_columns(self):
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -30,8 +33,8 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.names, ["b"])
         self.assertEqual(table.num_rows, 100)
 
-    def
-        table =
+    def test_read_from_local_single_column_via_column_names(self):
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -40,12 +43,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.names, ["b"])
         self.assertEqual(table.num_rows, 100)
 
-    def
+    def test_read_from_local_single_column_with_schema(self):
         schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -56,12 +59,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("a").type, pa.int8())
         self.assertEqual(table.num_rows, 100)
 
-    def
+    def test_read_from_local_single_column_with_schema_reverse_order(self):
         schema = pa.schema([("b", pa.string()), ("a", pa.int8())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -71,12 +74,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("a").type, pa.int8())
         self.assertEqual(table.num_rows, 100)
 
-    def
+    def test_read_from_local_single_column_with_schema_subset_cols(self):
         schema = pa.schema([("a", pa.int8())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -86,12 +89,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("a").type, pa.int8())
         self.assertEqual(table.num_rows, 100)
 
-    def
+    def test_read_from_local_single_column_with_schema_extra_cols(self):
         schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -104,12 +107,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("MISSING").type, pa.string())
         self.assertEqual(table.num_rows, 100)
 
-    def
+    def test_read_from_local_single_column_with_schema_extra_cols_column_names(self):
         schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -123,12 +126,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("MISSING").type, pa.string())
         self.assertEqual(table.num_rows, 100)
 
-    def
+    def test_read_from_local_single_column_with_schema_only_missing_col(self):
         schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -142,12 +145,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("MISSING").type, pa.string())
         self.assertEqual(table.num_rows, 0)
 
-    def
+    def test_read_from_local_single_column_with_row_groups(self):
 
         metadata = pq.read_metadata(self.MVP_PATH)
         ppp = PartialParquetParameters.of(pq_metadata=metadata)
         ppp["row_groups_to_download"] = ppp.row_groups_to_download[1:2]
-        table =
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -158,11 +161,11 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.num_rows, 10)
 
 
-class TestDaftS3FilesToDataFrame(unittest.TestCase):
+class TestFilesToDataFrame(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
 
-    def
-        df =
+    def test_read_local_files_all_columns(self):
+        df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -173,30 +176,117 @@ class TestDaftS3FilesToDataFrame(unittest.TestCase):
         self.assertEqual(table.schema.names, ["a", "b"])
         self.assertEqual(table.num_rows, 100)
 
-    def
-        df =
+    def test_read_local_files_with_column_selection(self):
+        df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
+            include_columns=["b"],
             ray_init_options={"local_mode": True, "ignore_reinit_error": True},
         )
 
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_read_local_files_does_not_materialize_by_default(self):
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+        )
+
+        # Should raise RuntimeError because df is not materialized yet
         self.assertRaises(RuntimeError, lambda: len(df))
+
+        # After collecting, it should work
         df.collect()
         self.assertEqual(len(df), 100)
 
-    def
+    def test_supports_unescaped_tsv_content_type(self):
+        # Test that UNESCAPED_TSV is now supported (was previously unsupported)
+        # Use a CSV file since we're testing TSV reader functionality
+        csv_path = "deltacat/tests/utils/data/non_empty_valid.csv"
+        df = files_to_dataframe(
+            uris=[csv_path],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.UNESCAPED_TSV.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+        )
+        # Should succeed without raising an exception - this tests that UNESCAPED_TSV is supported
+        table = df.to_arrow()
+        # Just verify we got some data back, don't assert specific schema since we're reading CSV as TSV
+        self.assertGreater(table.num_rows, 0)
+        self.assertGreater(len(table.schema.names), 0)
 
+    def test_supports_gzip_content_encoding(self):
+        # Test that GZIP encoding is now supported (was previously unsupported)
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.GZIP.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+        )
+        # Should succeed without raising an exception
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_raises_error_if_not_supported_content_type(self):
+        # Test that truly unsupported content types raise NotImplementedError
         self.assertRaises(
-
-            lambda:
+            NotImplementedError,
+            lambda: files_to_dataframe(
                 uris=[self.MVP_PATH],
                 content_encoding=ContentEncoding.IDENTITY.value,
-                content_type=ContentType.
+                content_type=ContentType.AVRO.value,  # AVRO is actually unsupported
                 ray_init_options={"local_mode": True, "ignore_reinit_error": True},
             ),
         )
 
+    def test_raises_error_if_not_supported_content_encoding(self):
+        # Test that truly unsupported content encodings raise NotImplementedError
+        self.assertRaises(
+            NotImplementedError,
+            lambda: files_to_dataframe(
+                uris=[self.MVP_PATH],
+                content_encoding=ContentEncoding.ZSTD.value,  # ZSTD is actually unsupported
+                content_type=ContentType.PARQUET.value,
+                ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+            ),
+        )
+
+    def test_accepts_custom_kwargs(self):
+        # Test that custom kwargs are passed through to daft.read_parquet
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+            # Custom kwarg that should be passed to daft.read_parquet
+            coerce_int96_timestamp_unit="ns",
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_accepts_io_config(self):
+        # Test that io_config parameter is accepted and passed correctly
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+            # io_config=None should work fine for local files
+            io_config=None,
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
 
 if __name__ == "__main__":
     unittest.main()
```