deltacat 1.1.11__py3-none-any.whl → 1.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/tests/utils/test_daft.py +38 -0
- deltacat/tests/utils/test_pyarrow.py +63 -0
- {deltacat-1.1.11.dist-info → deltacat-1.1.12.dist-info}/METADATA +3 -3
- {deltacat-1.1.11.dist-info → deltacat-1.1.12.dist-info}/RECORD +8 -8
- {deltacat-1.1.11.dist-info → deltacat-1.1.12.dist-info}/LICENSE +0 -0
- {deltacat-1.1.11.dist-info → deltacat-1.1.12.dist-info}/WHEEL +0 -0
- {deltacat-1.1.11.dist-info → deltacat-1.1.12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -104,6 +104,44 @@ class TestDaftS3FileToTable(unittest.TestCase):
|
|
104
104
|
self.assertEqual(table.schema.field("MISSING").type, pa.string())
|
105
105
|
self.assertEqual(table.num_rows, 100)
|
106
106
|
|
107
|
+
def test_read_from_s3_single_column_with_schema_extra_cols_column_names(self):
|
108
|
+
schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
|
109
|
+
pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
|
110
|
+
schema=schema
|
111
|
+
)
|
112
|
+
table = daft_s3_file_to_table(
|
113
|
+
self.MVP_PATH,
|
114
|
+
content_encoding=ContentEncoding.IDENTITY.value,
|
115
|
+
content_type=ContentType.PARQUET.value,
|
116
|
+
column_names=["a", "MISSING"],
|
117
|
+
pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
|
118
|
+
)
|
119
|
+
self.assertEqual(
|
120
|
+
table.schema.names, ["a", "MISSING"]
|
121
|
+
) # NOTE: "MISSING" is padded as a null array
|
122
|
+
self.assertEqual(table.schema.field("a").type, pa.int8())
|
123
|
+
self.assertEqual(table.schema.field("MISSING").type, pa.string())
|
124
|
+
self.assertEqual(table.num_rows, 100)
|
125
|
+
|
126
|
+
def test_read_from_s3_single_column_with_schema_only_missing_col(self):
|
127
|
+
schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
|
128
|
+
pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
|
129
|
+
schema=schema
|
130
|
+
)
|
131
|
+
table = daft_s3_file_to_table(
|
132
|
+
self.MVP_PATH,
|
133
|
+
content_encoding=ContentEncoding.IDENTITY.value,
|
134
|
+
content_type=ContentType.PARQUET.value,
|
135
|
+
include_columns=["MISSING"],
|
136
|
+
column_names=["a", "MISSING"],
|
137
|
+
pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
|
138
|
+
)
|
139
|
+
self.assertEqual(
|
140
|
+
table.schema.names, ["MISSING"]
|
141
|
+
) # NOTE: "MISSING" is padded as a null array
|
142
|
+
self.assertEqual(table.schema.field("MISSING").type, pa.string())
|
143
|
+
self.assertEqual(table.num_rows, 0)
|
144
|
+
|
107
145
|
def test_read_from_s3_single_column_with_row_groups(self):
|
108
146
|
|
109
147
|
metadata = pq.read_metadata(self.MVP_PATH)
|
@@ -85,6 +85,43 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
85
85
|
self.assertEqual(result_schema.field(2).type, "int64")
|
86
86
|
self.assertEqual(result_schema.field(2).name, "MISSING")
|
87
87
|
|
88
|
+
def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
|
89
|
+
|
90
|
+
pq_file = ParquetFile(PARQUET_FILE_PATH)
|
91
|
+
partial_parquet_params = PartialParquetParameters.of(
|
92
|
+
pq_metadata=pq_file.metadata
|
93
|
+
)
|
94
|
+
# only first row group to be downloaded
|
95
|
+
partial_parquet_params.row_groups_to_download.pop()
|
96
|
+
|
97
|
+
schema = pa.schema(
|
98
|
+
[
|
99
|
+
pa.field("n_legs", pa.string()),
|
100
|
+
pa.field("animal", pa.string()),
|
101
|
+
# NOTE: This field is not in the parquet file, but will be added on as an all-null column
|
102
|
+
pa.field("MISSING", pa.int64()),
|
103
|
+
]
|
104
|
+
)
|
105
|
+
|
106
|
+
pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
|
107
|
+
|
108
|
+
result = s3_partial_parquet_file_to_table(
|
109
|
+
PARQUET_FILE_PATH,
|
110
|
+
ContentType.PARQUET.value,
|
111
|
+
ContentEncoding.IDENTITY.value,
|
112
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
113
|
+
partial_file_download_params=partial_parquet_params,
|
114
|
+
column_names=["n_legs", "animal", "MISSING"],
|
115
|
+
include_columns=["MISSING"],
|
116
|
+
)
|
117
|
+
|
118
|
+
self.assertEqual(len(result), 0)
|
119
|
+
self.assertEqual(len(result.column_names), 1)
|
120
|
+
|
121
|
+
result_schema = result.schema
|
122
|
+
self.assertEqual(result_schema.field(0).type, "int64")
|
123
|
+
self.assertEqual(result_schema.field(0).name, "MISSING")
|
124
|
+
|
88
125
|
def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
|
89
126
|
self,
|
90
127
|
):
|
@@ -234,6 +271,32 @@ class TestReadCSV(TestCase):
|
|
234
271
|
lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
|
235
272
|
)
|
236
273
|
|
274
|
+
def test_read_csv_when_excess_columns_included(self):
|
275
|
+
|
276
|
+
schema = pa.schema(
|
277
|
+
[
|
278
|
+
("is_active", pa.string()),
|
279
|
+
("ship_datetime_utc", pa.timestamp("us")),
|
280
|
+
("MISSING", pa.string()),
|
281
|
+
]
|
282
|
+
)
|
283
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
284
|
+
_add_column_kwargs(
|
285
|
+
ContentType.UNESCAPED_TSV.value,
|
286
|
+
["is_active", "ship_datetime_utc", "MISSING"],
|
287
|
+
["is_active", "ship_datetime_utc", "MISSING"],
|
288
|
+
kwargs,
|
289
|
+
)
|
290
|
+
|
291
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
292
|
+
|
293
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
294
|
+
|
295
|
+
self.assertRaises(
|
296
|
+
pa.lib.ArrowInvalid,
|
297
|
+
lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
|
298
|
+
)
|
299
|
+
|
237
300
|
def test_read_csv_when_empty_csv_sanity(self):
|
238
301
|
|
239
302
|
schema = pa.schema(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: deltacat
|
3
|
-
Version: 1.1.11
|
3
|
+
Version: 1.1.12
|
4
4
|
Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
|
5
5
|
Home-page: https://github.com/ray-project/deltacat
|
6
6
|
Author: Ray Team
|
@@ -21,13 +21,13 @@ Requires-Dist: numpy ==1.21.5
|
|
21
21
|
Requires-Dist: pandas ==1.3.5
|
22
22
|
Requires-Dist: pyarrow ==12.0.1
|
23
23
|
Requires-Dist: pydantic ==1.10.4
|
24
|
-
Requires-Dist: ray
|
24
|
+
Requires-Dist: ray >=2.20.0
|
25
25
|
Requires-Dist: s3fs ==2024.5.0
|
26
26
|
Requires-Dist: tenacity ==8.1.0
|
27
27
|
Requires-Dist: typing-extensions ==4.4.0
|
28
28
|
Requires-Dist: pymemcache ==4.0.0
|
29
29
|
Requires-Dist: redis ==4.6.0
|
30
|
-
Requires-Dist: getdaft ==0.2.
|
30
|
+
Requires-Dist: getdaft ==0.2.31
|
31
31
|
Requires-Dist: schedule ==1.2.0
|
32
32
|
|
33
33
|
# DeltaCAT
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=jtrgQAzzZxq5YU_9RwtH0N4p8k4_ACd_vcjuWJf934Q,1778
|
2
2
|
deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
|
3
3
|
deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
|
4
4
|
deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
|
@@ -182,10 +182,10 @@ deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfW
|
|
182
182
|
deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
|
183
183
|
deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
184
184
|
deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iyzjiJ7-Rcl0o,1506
|
185
|
-
deltacat/tests/utils/test_daft.py,sha256=
|
185
|
+
deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
|
186
186
|
deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
|
187
187
|
deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
|
188
|
-
deltacat/tests/utils/test_pyarrow.py,sha256=
|
188
|
+
deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
|
189
189
|
deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
|
190
190
|
deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
|
191
191
|
deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -216,8 +216,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
216
216
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
217
217
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
218
218
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
219
|
-
deltacat-1.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
220
|
-
deltacat-1.1.11.dist-info/METADATA,sha256=
|
221
|
-
deltacat-1.1.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
222
|
-
deltacat-1.1.11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
223
|
-
deltacat-1.1.11.dist-info/RECORD,,
|
219
|
+
deltacat-1.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
220
|
+
deltacat-1.1.12.dist-info/METADATA,sha256=fqP5NYc_4wrfWcsQXO1Aht8xM22LrmDZLHfMAqq2opQ,1748
|
221
|
+
deltacat-1.1.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
222
|
+
deltacat-1.1.12.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
223
|
+
deltacat-1.1.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|