deltacat 1.1.10__py3-none-any.whl → 1.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +1 -1
- deltacat/exceptions.py +5 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +22 -0
- deltacat/tests/utils/test_daft.py +38 -0
- deltacat/tests/utils/test_pyarrow.py +63 -0
- {deltacat-1.1.10.dist-info → deltacat-1.1.12.dist-info}/METADATA +3 -3
- {deltacat-1.1.10.dist-info → deltacat-1.1.12.dist-info}/RECORD +11 -11
- {deltacat-1.1.10.dist-info → deltacat-1.1.12.dist-info}/LICENSE +0 -0
- {deltacat-1.1.10.dist-info → deltacat-1.1.12.dist-info}/WHEEL +0 -0
- {deltacat-1.1.10.dist-info → deltacat-1.1.12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -162,7 +162,7 @@ def group_by_pk_hash_bucket(
|
|
162
162
|
len(new_tables) == 1
|
163
163
|
), f"Expected only 1 table in the result but found {len(new_tables)}"
|
164
164
|
|
165
|
-
table =
|
165
|
+
table = new_tables[0]
|
166
166
|
|
167
167
|
# group hash bucket record indices
|
168
168
|
result = group_record_indices_by_hash_bucket(
|
deltacat/exceptions.py
CHANGED
@@ -213,11 +213,14 @@ def categorize_errors(func: Callable):
|
|
213
213
|
except BaseException as e:
|
214
214
|
deltacat_storage = None
|
215
215
|
deltacat_storage_kwargs = {}
|
216
|
+
all_args = args
|
216
217
|
if kwargs:
|
217
218
|
deltacat_storage = kwargs.get(DELTACAT_STORAGE_PARAM)
|
218
219
|
deltacat_storage_kwargs = kwargs.get(DELTACAT_STORAGE_KWARGS_PARAM, {})
|
219
|
-
|
220
|
-
|
220
|
+
all_args = all_args + tuple(kwargs.values())
|
221
|
+
|
222
|
+
if not deltacat_storage and all_args:
|
223
|
+
for arg in all_args:
|
221
224
|
if (
|
222
225
|
isinstance(arg, dict)
|
223
226
|
and arg.get(DELTACAT_STORAGE_PARAM) is not None
|
@@ -179,6 +179,28 @@ class TestReadWriteRoundCompletionFile:
|
|
179
179
|
|
180
180
|
assert rcf == expected_rcf_2
|
181
181
|
|
182
|
+
def test_read_when_none_destination_partition_id(self):
|
183
|
+
|
184
|
+
source_locator = get_test_partition_locator("source")
|
185
|
+
destination_locator = get_test_partition_locator(None)
|
186
|
+
|
187
|
+
expected_rcf = RoundCompletionInfo.of(
|
188
|
+
high_watermark=122,
|
189
|
+
compacted_delta_locator={},
|
190
|
+
compacted_pyarrow_write_result={},
|
191
|
+
sort_keys_bit_width=12,
|
192
|
+
)
|
193
|
+
|
194
|
+
write_round_completion_file(
|
195
|
+
RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
|
196
|
+
)
|
197
|
+
|
198
|
+
rcf = read_round_completion_file(
|
199
|
+
RCF_BUCKET_NAME, source_locator, destination_locator
|
200
|
+
)
|
201
|
+
|
202
|
+
assert rcf == expected_rcf
|
203
|
+
|
182
204
|
def test_write_when_custom_url_is_passed(self):
|
183
205
|
"""
|
184
206
|
This test case tests the backward compatibility by successfully
|
@@ -104,6 +104,44 @@ class TestDaftS3FileToTable(unittest.TestCase):
|
|
104
104
|
self.assertEqual(table.schema.field("MISSING").type, pa.string())
|
105
105
|
self.assertEqual(table.num_rows, 100)
|
106
106
|
|
107
|
+
def test_read_from_s3_single_column_with_schema_extra_cols_column_names(self):
|
108
|
+
schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
|
109
|
+
pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
|
110
|
+
schema=schema
|
111
|
+
)
|
112
|
+
table = daft_s3_file_to_table(
|
113
|
+
self.MVP_PATH,
|
114
|
+
content_encoding=ContentEncoding.IDENTITY.value,
|
115
|
+
content_type=ContentType.PARQUET.value,
|
116
|
+
column_names=["a", "MISSING"],
|
117
|
+
pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
|
118
|
+
)
|
119
|
+
self.assertEqual(
|
120
|
+
table.schema.names, ["a", "MISSING"]
|
121
|
+
) # NOTE: "MISSING" is padded as a null array
|
122
|
+
self.assertEqual(table.schema.field("a").type, pa.int8())
|
123
|
+
self.assertEqual(table.schema.field("MISSING").type, pa.string())
|
124
|
+
self.assertEqual(table.num_rows, 100)
|
125
|
+
|
126
|
+
def test_read_from_s3_single_column_with_schema_only_missing_col(self):
|
127
|
+
schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
|
128
|
+
pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
|
129
|
+
schema=schema
|
130
|
+
)
|
131
|
+
table = daft_s3_file_to_table(
|
132
|
+
self.MVP_PATH,
|
133
|
+
content_encoding=ContentEncoding.IDENTITY.value,
|
134
|
+
content_type=ContentType.PARQUET.value,
|
135
|
+
include_columns=["MISSING"],
|
136
|
+
column_names=["a", "MISSING"],
|
137
|
+
pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
|
138
|
+
)
|
139
|
+
self.assertEqual(
|
140
|
+
table.schema.names, ["MISSING"]
|
141
|
+
) # NOTE: "MISSING" is padded as a null array
|
142
|
+
self.assertEqual(table.schema.field("MISSING").type, pa.string())
|
143
|
+
self.assertEqual(table.num_rows, 0)
|
144
|
+
|
107
145
|
def test_read_from_s3_single_column_with_row_groups(self):
|
108
146
|
|
109
147
|
metadata = pq.read_metadata(self.MVP_PATH)
|
@@ -85,6 +85,43 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
85
85
|
self.assertEqual(result_schema.field(2).type, "int64")
|
86
86
|
self.assertEqual(result_schema.field(2).name, "MISSING")
|
87
87
|
|
88
|
+
def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
|
89
|
+
|
90
|
+
pq_file = ParquetFile(PARQUET_FILE_PATH)
|
91
|
+
partial_parquet_params = PartialParquetParameters.of(
|
92
|
+
pq_metadata=pq_file.metadata
|
93
|
+
)
|
94
|
+
# only first row group to be downloaded
|
95
|
+
partial_parquet_params.row_groups_to_download.pop()
|
96
|
+
|
97
|
+
schema = pa.schema(
|
98
|
+
[
|
99
|
+
pa.field("n_legs", pa.string()),
|
100
|
+
pa.field("animal", pa.string()),
|
101
|
+
# NOTE: This field is not in the parquet file, but will be added on as an all-null column
|
102
|
+
pa.field("MISSING", pa.int64()),
|
103
|
+
]
|
104
|
+
)
|
105
|
+
|
106
|
+
pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
|
107
|
+
|
108
|
+
result = s3_partial_parquet_file_to_table(
|
109
|
+
PARQUET_FILE_PATH,
|
110
|
+
ContentType.PARQUET.value,
|
111
|
+
ContentEncoding.IDENTITY.value,
|
112
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
113
|
+
partial_file_download_params=partial_parquet_params,
|
114
|
+
column_names=["n_legs", "animal", "MISSING"],
|
115
|
+
include_columns=["MISSING"],
|
116
|
+
)
|
117
|
+
|
118
|
+
self.assertEqual(len(result), 0)
|
119
|
+
self.assertEqual(len(result.column_names), 1)
|
120
|
+
|
121
|
+
result_schema = result.schema
|
122
|
+
self.assertEqual(result_schema.field(0).type, "int64")
|
123
|
+
self.assertEqual(result_schema.field(0).name, "MISSING")
|
124
|
+
|
88
125
|
def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
|
89
126
|
self,
|
90
127
|
):
|
@@ -234,6 +271,32 @@ class TestReadCSV(TestCase):
|
|
234
271
|
lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
|
235
272
|
)
|
236
273
|
|
274
|
+
def test_read_csv_when_excess_columns_included(self):
|
275
|
+
|
276
|
+
schema = pa.schema(
|
277
|
+
[
|
278
|
+
("is_active", pa.string()),
|
279
|
+
("ship_datetime_utc", pa.timestamp("us")),
|
280
|
+
("MISSING", pa.string()),
|
281
|
+
]
|
282
|
+
)
|
283
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
284
|
+
_add_column_kwargs(
|
285
|
+
ContentType.UNESCAPED_TSV.value,
|
286
|
+
["is_active", "ship_datetime_utc", "MISSING"],
|
287
|
+
["is_active", "ship_datetime_utc", "MISSING"],
|
288
|
+
kwargs,
|
289
|
+
)
|
290
|
+
|
291
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
292
|
+
|
293
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
294
|
+
|
295
|
+
self.assertRaises(
|
296
|
+
pa.lib.ArrowInvalid,
|
297
|
+
lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
|
298
|
+
)
|
299
|
+
|
237
300
|
def test_read_csv_when_empty_csv_sanity(self):
|
238
301
|
|
239
302
|
schema = pa.schema(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: deltacat
|
3
|
-
Version: 1.1.10
|
3
|
+
Version: 1.1.12
|
4
4
|
Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
|
5
5
|
Home-page: https://github.com/ray-project/deltacat
|
6
6
|
Author: Ray Team
|
@@ -21,13 +21,13 @@ Requires-Dist: numpy ==1.21.5
|
|
21
21
|
Requires-Dist: pandas ==1.3.5
|
22
22
|
Requires-Dist: pyarrow ==12.0.1
|
23
23
|
Requires-Dist: pydantic ==1.10.4
|
24
|
-
Requires-Dist: ray
|
24
|
+
Requires-Dist: ray >=2.20.0
|
25
25
|
Requires-Dist: s3fs ==2024.5.0
|
26
26
|
Requires-Dist: tenacity ==8.1.0
|
27
27
|
Requires-Dist: typing-extensions ==4.4.0
|
28
28
|
Requires-Dist: pymemcache ==4.0.0
|
29
29
|
Requires-Dist: redis ==4.6.0
|
30
|
-
Requires-Dist: getdaft ==0.2.
|
30
|
+
Requires-Dist: getdaft ==0.2.31
|
31
31
|
Requires-Dist: schedule ==1.2.0
|
32
32
|
|
33
33
|
# DeltaCAT
|
@@ -1,6 +1,6 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=jtrgQAzzZxq5YU_9RwtH0N4p8k4_ACd_vcjuWJf934Q,1778
|
2
2
|
deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
|
3
|
-
deltacat/exceptions.py,sha256=
|
3
|
+
deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
|
4
4
|
deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
|
5
5
|
deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
deltacat/aws/clients.py,sha256=VgddlV3AEjlBGIFmhhHxokYzwJ-lXnmHAeprVyADduI,6948
|
@@ -74,7 +74,7 @@ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQ
|
|
74
74
|
deltacat/compute/compactor_v2/utils/delta.py,sha256=8hjkDeIIkSX-gAQ2utQSp2sZcO2tWZHMTxpFusZwBHw,3635
|
75
75
|
deltacat/compute/compactor_v2/utils/io.py,sha256=autXlE3uHICdCCuJoS7mfdeJbRRiz2_xlz-3izlccB4,5264
|
76
76
|
deltacat/compute/compactor_v2/utils/merge.py,sha256=7UHxm71iJ1dgRoz8v73CqoeylNzO36t90OJsVVBDFxk,5312
|
77
|
-
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
|
77
|
+
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=ghyIifjXtqXgi8lN3lfnVQ2vi8uk_ny0FE7hsQlLjRQ,11538
|
78
78
|
deltacat/compute/compactor_v2/utils/task_options.py,sha256=XFvZ_8mCq3cDnFlopFG84IahcYEddilZDmU1PkKq-zg,14067
|
79
79
|
deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
|
80
80
|
deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
|
@@ -158,7 +158,7 @@ deltacat/tests/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
|
|
158
158
|
deltacat/tests/compute/compactor/steps/test_repartition.py,sha256=0uRguPEKeLSYs746Jv8io-HZMWdyXNcOMBu8GO2mA0M,9305
|
159
159
|
deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
160
160
|
deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
|
161
|
-
deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=
|
161
|
+
deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
|
162
162
|
deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
163
163
|
deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=2wIXQW0Jm_FtWB5EviUR6Uk2ddVCJKs-CYGKE1xSPu4,9617
|
164
164
|
deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
|
@@ -182,10 +182,10 @@ deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfW
|
|
182
182
|
deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
|
183
183
|
deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
184
184
|
deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iyzjiJ7-Rcl0o,1506
|
185
|
-
deltacat/tests/utils/test_daft.py,sha256=
|
185
|
+
deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
|
186
186
|
deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
|
187
187
|
deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
|
188
|
-
deltacat/tests/utils/test_pyarrow.py,sha256=
|
188
|
+
deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
|
189
189
|
deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
|
190
190
|
deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
|
191
191
|
deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -216,8 +216,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
216
216
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
217
217
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
218
218
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
219
|
-
deltacat-1.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
220
|
-
deltacat-1.1.
|
221
|
-
deltacat-1.1.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
222
|
-
deltacat-1.1.10.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
223
|
-
deltacat-1.1.10.dist-info/RECORD,,
|
219
|
+
deltacat-1.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
220
|
+
deltacat-1.1.12.dist-info/METADATA,sha256=fqP5NYc_4wrfWcsQXO1Aht8xM22LrmDZLHfMAqq2opQ,1748
|
221
|
+
deltacat-1.1.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
222
|
+
deltacat-1.1.12.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
223
|
+
deltacat-1.1.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|