deltacat 1.1.10__py3-none-any.whl → 1.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.10"
47
+ __version__ = "1.1.12"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -162,7 +162,7 @@ def group_by_pk_hash_bucket(
162
162
  len(new_tables) == 1
163
163
  ), f"Expected only 1 table in the result but found {len(new_tables)}"
164
164
 
165
- table = generate_pk_hash_column([table], primary_keys, requires_hash=True)[0]
165
+ table = new_tables[0]
166
166
 
167
167
  # group hash bucket record indices
168
168
  result = group_record_indices_by_hash_bucket(
deltacat/exceptions.py CHANGED
@@ -213,11 +213,14 @@ def categorize_errors(func: Callable):
213
213
  except BaseException as e:
214
214
  deltacat_storage = None
215
215
  deltacat_storage_kwargs = {}
216
+ all_args = args
216
217
  if kwargs:
217
218
  deltacat_storage = kwargs.get(DELTACAT_STORAGE_PARAM)
218
219
  deltacat_storage_kwargs = kwargs.get(DELTACAT_STORAGE_KWARGS_PARAM, {})
219
- if not deltacat_storage and args:
220
- for arg in args:
220
+ all_args = all_args + tuple(kwargs.values())
221
+
222
+ if not deltacat_storage and all_args:
223
+ for arg in all_args:
221
224
  if (
222
225
  isinstance(arg, dict)
223
226
  and arg.get(DELTACAT_STORAGE_PARAM) is not None
@@ -179,6 +179,28 @@ class TestReadWriteRoundCompletionFile:
179
179
 
180
180
  assert rcf == expected_rcf_2
181
181
 
182
+ def test_read_when_none_destination_partition_id(self):
183
+
184
+ source_locator = get_test_partition_locator("source")
185
+ destination_locator = get_test_partition_locator(None)
186
+
187
+ expected_rcf = RoundCompletionInfo.of(
188
+ high_watermark=122,
189
+ compacted_delta_locator={},
190
+ compacted_pyarrow_write_result={},
191
+ sort_keys_bit_width=12,
192
+ )
193
+
194
+ write_round_completion_file(
195
+ RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
196
+ )
197
+
198
+ rcf = read_round_completion_file(
199
+ RCF_BUCKET_NAME, source_locator, destination_locator
200
+ )
201
+
202
+ assert rcf == expected_rcf
203
+
182
204
  def test_write_when_custom_url_is_passed(self):
183
205
  """
184
206
  This test case tests the backward compatibility by successfully
@@ -104,6 +104,44 @@ class TestDaftS3FileToTable(unittest.TestCase):
104
104
  self.assertEqual(table.schema.field("MISSING").type, pa.string())
105
105
  self.assertEqual(table.num_rows, 100)
106
106
 
107
+ def test_read_from_s3_single_column_with_schema_extra_cols_column_names(self):
108
+ schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
109
+ pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
110
+ schema=schema
111
+ )
112
+ table = daft_s3_file_to_table(
113
+ self.MVP_PATH,
114
+ content_encoding=ContentEncoding.IDENTITY.value,
115
+ content_type=ContentType.PARQUET.value,
116
+ column_names=["a", "MISSING"],
117
+ pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
118
+ )
119
+ self.assertEqual(
120
+ table.schema.names, ["a", "MISSING"]
121
+ ) # NOTE: "MISSING" is padded as a null array
122
+ self.assertEqual(table.schema.field("a").type, pa.int8())
123
+ self.assertEqual(table.schema.field("MISSING").type, pa.string())
124
+ self.assertEqual(table.num_rows, 100)
125
+
126
+ def test_read_from_s3_single_column_with_schema_only_missing_col(self):
127
+ schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
128
+ pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
129
+ schema=schema
130
+ )
131
+ table = daft_s3_file_to_table(
132
+ self.MVP_PATH,
133
+ content_encoding=ContentEncoding.IDENTITY.value,
134
+ content_type=ContentType.PARQUET.value,
135
+ include_columns=["MISSING"],
136
+ column_names=["a", "MISSING"],
137
+ pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
138
+ )
139
+ self.assertEqual(
140
+ table.schema.names, ["MISSING"]
141
+ ) # NOTE: "MISSING" is padded as a null array
142
+ self.assertEqual(table.schema.field("MISSING").type, pa.string())
143
+ self.assertEqual(table.num_rows, 0)
144
+
107
145
  def test_read_from_s3_single_column_with_row_groups(self):
108
146
 
109
147
  metadata = pq.read_metadata(self.MVP_PATH)
@@ -85,6 +85,43 @@ class TestS3PartialParquetFileToTable(TestCase):
85
85
  self.assertEqual(result_schema.field(2).type, "int64")
86
86
  self.assertEqual(result_schema.field(2).name, "MISSING")
87
87
 
88
+ def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
89
+
90
+ pq_file = ParquetFile(PARQUET_FILE_PATH)
91
+ partial_parquet_params = PartialParquetParameters.of(
92
+ pq_metadata=pq_file.metadata
93
+ )
94
+ # only first row group to be downloaded
95
+ partial_parquet_params.row_groups_to_download.pop()
96
+
97
+ schema = pa.schema(
98
+ [
99
+ pa.field("n_legs", pa.string()),
100
+ pa.field("animal", pa.string()),
101
+ # NOTE: This field is not in the parquet file, but will be added on as an all-null column
102
+ pa.field("MISSING", pa.int64()),
103
+ ]
104
+ )
105
+
106
+ pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
107
+
108
+ result = s3_partial_parquet_file_to_table(
109
+ PARQUET_FILE_PATH,
110
+ ContentType.PARQUET.value,
111
+ ContentEncoding.IDENTITY.value,
112
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
113
+ partial_file_download_params=partial_parquet_params,
114
+ column_names=["n_legs", "animal", "MISSING"],
115
+ include_columns=["MISSING"],
116
+ )
117
+
118
+ self.assertEqual(len(result), 0)
119
+ self.assertEqual(len(result.column_names), 1)
120
+
121
+ result_schema = result.schema
122
+ self.assertEqual(result_schema.field(0).type, "int64")
123
+ self.assertEqual(result_schema.field(0).name, "MISSING")
124
+
88
125
  def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
89
126
  self,
90
127
  ):
@@ -234,6 +271,32 @@ class TestReadCSV(TestCase):
234
271
  lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
235
272
  )
236
273
 
274
+ def test_read_csv_when_excess_columns_included(self):
275
+
276
+ schema = pa.schema(
277
+ [
278
+ ("is_active", pa.string()),
279
+ ("ship_datetime_utc", pa.timestamp("us")),
280
+ ("MISSING", pa.string()),
281
+ ]
282
+ )
283
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
284
+ _add_column_kwargs(
285
+ ContentType.UNESCAPED_TSV.value,
286
+ ["is_active", "ship_datetime_utc", "MISSING"],
287
+ ["is_active", "ship_datetime_utc", "MISSING"],
288
+ kwargs,
289
+ )
290
+
291
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
292
+
293
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
294
+
295
+ self.assertRaises(
296
+ pa.lib.ArrowInvalid,
297
+ lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
298
+ )
299
+
237
300
  def test_read_csv_when_empty_csv_sanity(self):
238
301
 
239
302
  schema = pa.schema(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.10
3
+ Version: 1.1.12
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -21,13 +21,13 @@ Requires-Dist: numpy ==1.21.5
21
21
  Requires-Dist: pandas ==1.3.5
22
22
  Requires-Dist: pyarrow ==12.0.1
23
23
  Requires-Dist: pydantic ==1.10.4
24
- Requires-Dist: ray[default] >=2.20.0
24
+ Requires-Dist: ray >=2.20.0
25
25
  Requires-Dist: s3fs ==2024.5.0
26
26
  Requires-Dist: tenacity ==8.1.0
27
27
  Requires-Dist: typing-extensions ==4.4.0
28
28
  Requires-Dist: pymemcache ==4.0.0
29
29
  Requires-Dist: redis ==4.6.0
30
- Requires-Dist: getdaft ==0.2.27
30
+ Requires-Dist: getdaft ==0.2.31
31
31
  Requires-Dist: schedule ==1.2.0
32
32
 
33
33
  # DeltaCAT
@@ -1,6 +1,6 @@
1
- deltacat/__init__.py,sha256=3zwJ4F2gVe9lTHU4b_d7KJ42x8OpAVxhXw4jG-T1TQk,1778
1
+ deltacat/__init__.py,sha256=jtrgQAzzZxq5YU_9RwtH0N4p8k4_ACd_vcjuWJf934Q,1778
2
2
  deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
3
- deltacat/exceptions.py,sha256=q9HVyYLZc98c9TofAuD4SWCxPqV8F6F9gpczBUNCJWo,12672
3
+ deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
4
4
  deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
5
5
  deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  deltacat/aws/clients.py,sha256=VgddlV3AEjlBGIFmhhHxokYzwJ-lXnmHAeprVyADduI,6948
@@ -74,7 +74,7 @@ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQ
74
74
  deltacat/compute/compactor_v2/utils/delta.py,sha256=8hjkDeIIkSX-gAQ2utQSp2sZcO2tWZHMTxpFusZwBHw,3635
75
75
  deltacat/compute/compactor_v2/utils/io.py,sha256=autXlE3uHICdCCuJoS7mfdeJbRRiz2_xlz-3izlccB4,5264
76
76
  deltacat/compute/compactor_v2/utils/merge.py,sha256=7UHxm71iJ1dgRoz8v73CqoeylNzO36t90OJsVVBDFxk,5312
77
- deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=qE1s65HWlcWmvYyXAZm7R1h88M2Min9gp4rUgpfS3-A,11594
77
+ deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=ghyIifjXtqXgi8lN3lfnVQ2vi8uk_ny0FE7hsQlLjRQ,11538
78
78
  deltacat/compute/compactor_v2/utils/task_options.py,sha256=XFvZ_8mCq3cDnFlopFG84IahcYEddilZDmU1PkKq-zg,14067
79
79
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
80
80
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -158,7 +158,7 @@ deltacat/tests/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
158
158
  deltacat/tests/compute/compactor/steps/test_repartition.py,sha256=0uRguPEKeLSYs746Jv8io-HZMWdyXNcOMBu8GO2mA0M,9305
159
159
  deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
160
160
  deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
161
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=9c-EnE5zpToFKJl2djtEEDErUZVIhbEJXkg3hVdR_ps,6758
161
+ deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
162
162
  deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
163
163
  deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=2wIXQW0Jm_FtWB5EviUR6Uk2ddVCJKs-CYGKE1xSPu4,9617
164
164
  deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
@@ -182,10 +182,10 @@ deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfW
182
182
  deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
183
183
  deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
184
  deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iyzjiJ7-Rcl0o,1506
185
- deltacat/tests/utils/test_daft.py,sha256=AIE0qz6oKhEEvBqF0VfQ5pwTtiTHqyf0EuUXduiS3t4,6487
185
+ deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
186
186
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
187
187
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
188
- deltacat/tests/utils/test_pyarrow.py,sha256=eZAuYp9MUf8lmpIilH57JkURuNsTGZ3IAGC4Gm5hdrM,17307
188
+ deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
189
189
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
190
190
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
191
191
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -216,8 +216,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
216
216
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
217
217
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
218
218
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
219
- deltacat-1.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
220
- deltacat-1.1.10.dist-info/METADATA,sha256=nh4dJ-kcwCy3ZNso0NtMlAZcABxtLZtp8CYbZpz_x00,1757
221
- deltacat-1.1.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
222
- deltacat-1.1.10.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
223
- deltacat-1.1.10.dist-info/RECORD,,
219
+ deltacat-1.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
220
+ deltacat-1.1.12.dist-info/METADATA,sha256=fqP5NYc_4wrfWcsQXO1Aht8xM22LrmDZLHfMAqq2opQ,1748
221
+ deltacat-1.1.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
222
+ deltacat-1.1.12.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
223
+ deltacat-1.1.12.dist-info/RECORD,,