deltacat 0.1.18b15__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (31)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +11 -1
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +13 -0
  4. deltacat/compute/compactor/model/delta_annotated.py +10 -6
  5. deltacat/compute/compactor/repartition_session.py +2 -0
  6. deltacat/compute/compactor/steps/repartition.py +6 -0
  7. deltacat/compute/compactor_v2/compaction_session.py +72 -69
  8. deltacat/compute/compactor_v2/constants.py +3 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +17 -1
  10. deltacat/compute/compactor_v2/steps/merge.py +430 -2
  11. deltacat/compute/compactor_v2/utils/content_type_params.py +43 -14
  12. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  13. deltacat/compute/compactor_v2/utils/io.py +11 -8
  14. deltacat/compute/compactor_v2/utils/primary_key_index.py +58 -25
  15. deltacat/compute/compactor_v2/utils/task_options.py +8 -15
  16. deltacat/tests/compute/common.py +1 -1
  17. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -0
  18. deltacat/tests/compute/test_compaction_session_incremental.py +16 -1
  19. deltacat/tests/compute/testcases.py +7 -2
  20. deltacat/tests/test_utils/pyarrow.py +23 -6
  21. deltacat/types/partial_download.py +1 -0
  22. deltacat/types/tables.py +5 -0
  23. deltacat/utils/arguments.py +1 -2
  24. deltacat/utils/pyarrow.py +5 -0
  25. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +1 -1
  26. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +29 -30
  27. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  28. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +0 -199
  29. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  30. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +0 -0
  31. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED
@@ -31,8 +31,19 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Tab
     return sc.append_pk_hash_string_column(table, result)
 
 
-def _is_sha1_desired(hash_column: pa.Array) -> bool:
-    return hash_column.nbytes > TOTAL_BYTES_IN_SHA1_HASH * len(hash_column)
+def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
+    total_size = 0
+    total_len = 0
+
+    for hash_column in hash_columns:
+        total_size += hash_column.nbytes
+        total_len += len(hash_column)
+
+    logger.info(
+        f"Found total length of hash column={total_len} and total_size={total_size}"
+    )
+
+    return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
 def _append_table_by_hash_bucket(
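Note on the hunk above: the heuristic now sums sizes across all candidate hash columns before deciding whether SHA-1 digests would be smaller than the raw concatenated key strings. A minimal standalone sketch of the same comparison, using 20 bytes as an assumed stand-in for TOTAL_BYTES_IN_SHA1_HASH:

    import pyarrow as pa

    DIGEST_SIZE = 20  # assumed stand-in for TOTAL_BYTES_IN_SHA1_HASH
    hash_columns = [
        pa.array(["customer-0001|us-east-1", "customer-0002|us-east-1"]),
        pa.array(["customer-0003|eu-west-1"]),
    ]
    total_size = sum(col.nbytes for col in hash_columns)
    total_len = sum(len(col) for col in hash_columns)
    # hashing only pays off when the key strings are, on average, larger than the digest
    is_sha1_desired = total_size > DIGEST_SIZE * total_len
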
@@ -61,7 +72,9 @@ def _append_table_by_hash_bucket(
     for i, group_count in enumerate(group_count_array):
         hb_idx = hb_group_array[i].as_py()
         pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
-        pyarrow_table = pyarrow_table.drop([sc._HASH_BUCKET_IDX_COLUMN_NAME])
+        pyarrow_table = pyarrow_table.drop(
+            [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
+        )
         if hash_bucket_to_table[hb_idx] is None:
             hash_bucket_to_table[hb_idx] = []
         hash_bucket_to_table[hb_idx].append(pyarrow_table)
@@ -142,7 +155,7 @@ def _optimized_group_record_batches_by_hash_bucket(
 def group_by_pk_hash_bucket(
     table: pa.Table, num_buckets: int, primary_keys: List[str]
 ) -> np.ndarray:
-    table = generate_pk_hash_column(table, primary_keys, requires_sha1=True)
+    table = generate_pk_hash_column([table], primary_keys, requires_sha1=True)[0]
 
     # group hash bucket record indices
     result = group_record_indices_by_hash_bucket(
@@ -154,53 +167,73 @@ def group_by_pk_hash_bucket(
 
 
 def generate_pk_hash_column(
-    table: pa.Table,
+    tables: List[pa.Table],
     primary_keys: Optional[List[str]] = None,
     requires_sha1: bool = False,
-) -> pa.Table:
+) -> List[pa.Table]:
     """
-    Returns a new table after generating the primary key hash if desired.
+    Returns a new table list after generating the primary key hash if desired.
 
     1. If there are no primary keys, each hash will be unique uuid/sha1 hex
-    2. If there are more than 0 primary keys, returns a table with new columns appended.
+    2. If there are more than 0 primary keys, returns a table with pk hash column appended.
     """
 
-    start = time.monotonic()
-
-    can_sha1 = False
-    if primary_keys:
+    def _generate_pk_hash(table: pa.Table) -> pa.Array:
         pk_columns = []
         for pk_name in primary_keys:
             pk_columns.append(pc.cast(table[pk_name], pa.string()))
 
         pk_columns.append(PK_DELIMITER)
         hash_column = pc.binary_join_element_wise(*pk_columns)
+        return hash_column
 
-        can_sha1 = requires_sha1 or _is_sha1_desired(hash_column)
-    else:
+    def _generate_uuid(table: pa.Table) -> pa.Array:
         hash_column = pa.array(
             [uuid.uuid4().hex for _ in range(len(table))], pa.string()
         )
+        return hash_column
+
+    start = time.monotonic()
+
+    hash_column_list = []
+
+    can_sha1 = False
+    if primary_keys:
+        hash_column_list = [_generate_pk_hash(table) for table in tables]
+
+        can_sha1 = requires_sha1 or _is_sha1_desired(hash_column_list)
+    else:
+        hash_column_list = [_generate_uuid(table) for table in tables]
 
     logger.info(
-        f"can_generate_sha1={can_sha1} for the table with hash column size"
-        f"={hash_column.nbytes} bytes, num_rows={len(hash_column)}, "
-        f"and requires_sha1={requires_sha1}"
+        f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_sha1}"
     )
 
-    if can_sha1:
-        table = _append_sha1_hash_to_table(table, hash_column)
-    else:
-        table = table.append_column(sc._PK_HASH_STRING_COLUMN_FIELD, hash_column)
+    result = []
+
+    total_len = 0
+    total_size = 0
+    for index, table in enumerate(tables):
+        if can_sha1:
+            table = _append_sha1_hash_to_table(table, hash_column_list[index])
+        else:
+            table = table.append_column(
+                sc._PK_HASH_STRING_COLUMN_FIELD, hash_column_list[index]
+            )
+
+        total_len += len(table)
+        total_size += hash_column_list[index].nbytes
+
+        result.append(table)
 
     end = time.monotonic()
 
     logger.info(
-        f"Took {end - start}s to generate pk hash of len: {len(hash_column)}"
-        f" and size: {hash_column.nbytes} bytes"
+        f"Took {end - start}s to generate pk hash of len: {total_len}"
+        f" for size: {total_size} bytes"
     )
 
-    return table
+    return result
 
 
 def group_record_indices_by_hash_bucket(
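For context on the refactor above: the primary-key hash string is still produced by casting each key column to string and joining the columns row-wise with a delimiter; the function now just does this for a list of tables so the SHA-1 decision can be made once across all of them. A small pyarrow-only sketch of that join step (the "|" delimiter is illustrative, not the actual PK_DELIMITER constant):

    import pyarrow as pa
    import pyarrow.compute as pc

    table = pa.table({"id": [1, 2], "region": ["us", "eu"]})
    pk_columns = [pc.cast(table[name], pa.string()) for name in ["id", "region"]]
    pk_columns.append("|")  # illustrative delimiter; deltacat uses PK_DELIMITER
    hash_column = pc.binary_join_element_wise(*pk_columns)
    # hash_column -> ["1|us", "2|eu"]
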
@@ -298,7 +331,7 @@ def hash_group_index_to_hash_bucket_indices(
     if hb_group > num_buckets:
         return []
 
-    return range(hb_group, num_groups, num_buckets)
+    return range(hb_group, num_buckets, num_groups)
 
 
 def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
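The stop/step swap above is easiest to read with concrete numbers: buckets are distributed round-robin across hash groups, so a group owns every num_groups-th bucket starting at its own index.

    # e.g. 8 hash buckets split across 3 hash groups; group 1 owns buckets 1, 4, 7
    hb_group, num_buckets, num_groups = 1, 8, 3
    list(range(hb_group, num_buckets, num_groups))  # -> [1, 4, 7]
    # the old argument order, range(hb_group, num_groups, num_buckets), returned only [1]
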
deltacat/compute/compactor_v2/utils/task_options.py CHANGED
@@ -2,7 +2,7 @@ from typing import Dict, Optional, List, Tuple
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.storage import (
-    Delta,
+    Manifest,
     ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
@@ -11,9 +11,6 @@ from deltacat.compute.compactor.model.round_completion_info import RoundCompleti
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     hash_group_index_to_hash_bucket_indices,
 )
-from deltacat.compute.compactor_v2.utils.content_type_params import (
-    append_content_type_params,
-)
 from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE
 
 
@@ -24,6 +21,7 @@ def _get_parquet_type_params_if_exist(
         entry.meta
         and entry.meta.content_type == ContentType.PARQUET
         and entry.meta.content_encoding == ContentEncoding.IDENTITY
+        and entry.meta.content_type_parameters
     ):
         for type_params in entry.meta.content_type_parameters:
             if isinstance(type_params, PartialParquetParameters):
@@ -93,7 +91,7 @@ def estimate_manifest_entry_column_size_bytes(
 
     type_params = _get_parquet_type_params_if_exist(entry=entry)
 
-    if type_params.pq_metadata:
+    if type_params and type_params.pq_metadata:
         return _calculate_parquet_column_size(type_params=type_params, columns=columns)
 
     return None
@@ -153,7 +151,7 @@ def merge_resource_options_provider(
     hash_group_size_bytes: Dict[int, int],
     hash_group_num_rows: Dict[int, int],
     round_completion_info: Optional[RoundCompletionInfo] = None,
-    compacted_delta: Optional[Delta] = None,
+    compacted_delta_manifest: Optional[Manifest] = None,
     primary_keys: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict] = {},
@@ -168,8 +166,8 @@ def merge_resource_options_provider(
 
     if (
         round_completion_info
-        and compacted_delta
-        and round_completion_info.hb_index_to_entry_range_both_inclusive
+        and compacted_delta_manifest
+        and round_completion_info.hb_index_to_entry_range
     ):
 
         previous_inflation = (
@@ -187,15 +185,10 @@ def merge_resource_options_provider(
 
         for hb_idx in iterable:
             entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
-                hb_idx
+                str(hb_idx)
             ]
             for entry_index in range(entry_start, entry_end):
-                entry = append_content_type_params(
-                    compacted_delta,
-                    entry_index=entry_index,
-                    deltacat_storage=deltacat_storage,
-                    deltacat_storage_kwargs=deltacat_storage_kwargs,
-                )
+                entry = compacted_delta_manifest.entries[entry_index]
 
                 current_entry_size = estimate_manifest_entry_size_bytes(
                     entry=entry, previous_inflation=previous_inflation
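A note on the lookup change above: hb_index_to_entry_range appears to be keyed by stringified hash-bucket indices (consistent with a structure that round-trips through JSON), so the integer index is converted with str() before indexing, and the resulting [start, end) range selects entries directly from the compacted delta's manifest. A tiny illustration of the key handling with made-up values:

    hb_index_to_entry_range = {"0": (0, 2), "1": (2, 5)}  # hypothetical contents
    hb_idx = 1
    entry_start, entry_end = hb_index_to_entry_range[str(hb_idx)]
    entry_indices = list(range(entry_start, entry_end))   # -> [2, 3, 4]
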
deltacat/tests/compute/common.py CHANGED
@@ -15,7 +15,7 @@ BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
 BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
 BASE_TEST_DESTINATION_TABLE_VERSION = "1"
 
-HASH_BUCKET_COUNT: int = 1
+HASH_BUCKET_COUNT: int = 3
 
 MAX_RECORDS_PER_FILE: int = 1
 
deltacat/tests/compute/compactor/steps/test_repartition.py CHANGED
@@ -49,6 +49,7 @@ class TestRepartitionRange(unittest.TestCase):
         self.destination_partition: PartitionLocator = MagicMock()
         self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
         self.max_records_per_output_file = 2
+        self.s3_table_writer_kwargs = {}
         self.repartitioned_file_content_type = ContentType.PARQUET
         self.deltacat_storage = MagicMock()
         self.deltacat_storage_kwargs = MagicMock()
@@ -59,6 +60,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -85,6 +87,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -98,6 +101,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -110,6 +114,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -123,6 +128,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -137,6 +143,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -151,6 +158,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -167,6 +175,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -180,6 +189,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
@@ -196,6 +206,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
        ),
@@ -222,6 +233,7 @@ class TestRepartitionRange(unittest.TestCase):
            self.destination_partition,
            self.repartition_args,
            self.max_records_per_output_file,
+           self.s3_table_writer_kwargs,
            self.repartitioned_file_content_type,
            self.deltacat_storage,
            self.deltacat_storage_kwargs,
deltacat/tests/compute/test_compaction_session_incremental.py CHANGED
@@ -238,7 +238,8 @@ def test_compact_partition_incremental(
     rebase_source_partition_locator_param,
     partition_values_param,
     expected_result,
-    validation_callback_func,  # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
+    # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
+    validation_callback_func,
     validation_callback_func_kwargs,
     do_teardown_local_deltacat_storage_db,
     use_prev_compacted,
@@ -281,6 +282,10 @@ def test_compact_partition_incremental(
         arrow_arrays_param,
         partition_values_param,
         ds_mock_kwargs,
+        f"{test_name}_src_namespace",
+        f"{test_name}_table_src",
+        f"{test_name}_dest_namespace",
+        f"{test_name}_table_dest",
     )
     ray.shutdown()
     ray.init(local_mode=True)
@@ -334,6 +339,16 @@ def test_compact_partition_incremental(
     compacted_delta_locator = round_completion_info.compacted_delta_locator
     tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
     compacted_table = pa.concat_tables(tables)
+
+    # the compacted table may contain multiple files and chunks
+    # and order of records may be incorrect due to multiple files.
+    expected_result = expected_result.combine_chunks().sort_by(
+        [(val, "ascending") for val in primary_keys_param]
+    )
+    compacted_table = compacted_table.combine_chunks().sort_by(
+        [(val, "ascending") for val in primary_keys_param]
+    )
+
     assert compacted_table.equals(
         expected_result
     ), f"{compacted_table} does not match {expected_result}"
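Because the compacted delta may now span multiple files and chunks, record order in the downloaded result is not deterministic, so both tables are canonicalized before the equality check. A self-contained sketch of the same comparison pattern with a single made-up primary key:

    import pyarrow as pa

    expected = pa.table({"pk": [1, 2, 3], "val": ["a", "b", "c"]})
    actual = pa.table({"pk": [3, 1, 2], "val": ["c", "a", "b"]})
    sort_keys = [(pk, "ascending") for pk in ["pk"]]
    assert actual.combine_chunks().sort_by(sort_keys).equals(
        expected.combine_chunks().sort_by(sort_keys)
    )
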
deltacat/tests/compute/testcases.py CHANGED
@@ -12,13 +12,18 @@ from deltacat.tests.compute.common import (
 from deltacat.compute.compactor.compaction_session import (
     compact_partition_from_request as compact_partition_v1,
 )
+from deltacat.compute.compactor_v2.compaction_session import (
+    compact_partition as compact_partition_v2,
+)
 
 
 def create_tests_cases_for_all_compactor_versions(test_cases: Dict[str, List]):
     final_cases = {}
-    for version, compact_partition_func in enumerate([compact_partition_v1]):
+    for version, compact_partition_func in enumerate(
+        [compact_partition_v1, compact_partition_v2]
+    ):
         for case_name, case_value in test_cases.items():
-            final_cases[f"{case_name}_v{version}"] = [
+            final_cases[f"{case_name}_v{version + 1}"] = [
                 *case_value,
                 compact_partition_func,
             ]
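The parameterization above fans each named case out across both compactor implementations, and the +1 offset keeps the suffix aligned with the compactor version rather than the zero-based enumerate index. For example (the case name here is hypothetical):

    for version, func_name in enumerate(["compact_partition_v1", "compact_partition_v2"]):
        print(f"incremental_dedupe_v{version + 1} -> {func_name}")
    # incremental_dedupe_v1 -> compact_partition_v1
    # incremental_dedupe_v2 -> compact_partition_v2
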
deltacat/tests/test_utils/pyarrow.py CHANGED
@@ -1,24 +1,42 @@
 from typing import List
 import pyarrow as pa
-from deltacat.storage import Delta
+from deltacat.storage import Delta, Partition
 import deltacat.tests.local_deltacat_storage as ds
 
 
 def create_delta_from_csv_file(
     namespace: str, file_paths: List[str], *args, **kwargs
 ) -> Delta:
-    tables = []
+    staged_partition = stage_partition_from_csv_file(
+        namespace, file_paths, *args, **kwargs
+    )
 
-    for file_path in file_paths:
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+    committed_delta = commit_delta_to_staged_partition(
+        staged_partition, file_paths, *args, **kwargs
+    )
 
+    return committed_delta
+
+
+def stage_partition_from_csv_file(
+    namespace: str, file_paths: List[str], *args, **kwargs
+) -> Partition:
     ds.create_namespace(namespace, {}, **kwargs)
     table_name = "-".join(file_paths).replace("/", "_")
     ds.create_table_version(namespace, table_name, "1", **kwargs)
     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
     staged_partition = ds.stage_partition(stream, [], **kwargs)
+    return staged_partition
 
+
+def commit_delta_to_staged_partition(
+    staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    tables = []
+
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
     deltas = []
 
     for table in tables:
@@ -28,5 +46,4 @@ def create_delta_from_csv_file(
     merged_delta = Delta.merge_deltas(deltas=deltas)
     committed_delta = ds.commit_delta(merged_delta, **kwargs)
     ds.commit_partition(staged_partition, **kwargs)
-
     return committed_delta
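The split above lets a test stage a partition and commit the CSV-backed delta as two separate steps instead of always doing both at once. A rough usage sketch, where the file path and the `ds_kwargs` carrying the local deltacat storage configuration are placeholders:

    staged_partition = stage_partition_from_csv_file(
        "test_namespace", ["path/to/data.csv"], **ds_kwargs
    )
    delta = commit_delta_to_staged_partition(
        staged_partition, ["path/to/data.csv"], **ds_kwargs
    )
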
deltacat/types/partial_download.py CHANGED
@@ -38,6 +38,7 @@ class PartialParquetParameters(PartialFileDownloadParams):
         num_row_groups = pq_metadata.num_row_groups
         row_groups_to_download = [rg for rg in range(num_row_groups)]
         in_memory_size_bytes = 0.0
+        num_rows = pq_metadata.num_rows
 
         for rg in row_groups_to_download:
             row_group_meta = pq_metadata.row_group(rg)
deltacat/types/tables.py CHANGED
@@ -4,6 +4,7 @@ from typing import Callable, Dict, Type, Union
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pyarrow.parquet as papq
 from ray.data.dataset import Dataset
 from ray.data.read_api import (
     from_arrow,
@@ -49,6 +50,7 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
 ] = {
     pa.Table: pa_utils.table_size,
+    papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
     Dataset: ds_utils.dataset_size,
@@ -56,18 +58,21 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
 
 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
+    papq.ParquetFile: TableType.PYARROW_PARQUET.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC: Dict[str, Callable] = {
     TableType.PYARROW.value: from_arrow,
+    TableType.PYARROW_PARQUET.value: from_arrow,
     TableType.NUMPY.value: from_numpy,
     TableType.PANDAS.value: from_pandas,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PYARROW.value: from_arrow_refs,
+    TableType.PYARROW_PARQUET.value: from_arrow_refs,
     TableType.NUMPY.value: from_numpy,
     TableType.PANDAS.value: from_pandas_refs,
 }
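These registrations route lazily opened parquet files through the same class-keyed dispatch tables already used for fully materialized pyarrow tables. A minimal sketch of that dispatch pattern with simplified stand-ins (not the deltacat helpers themselves):

    import pyarrow as pa
    import pyarrow.parquet as papq

    SIZE_FUNCS = {
        pa.Table: lambda t: t.nbytes,
        papq.ParquetFile: lambda f: f.metadata.serialized_size,
    }

    def table_size(table) -> int:
        # dispatch on the concrete table class
        return SIZE_FUNCS[type(table)](table)

    print(table_size(pa.table({"x": [1, 2, 3]})))
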
deltacat/utils/arguments.py CHANGED
@@ -1,5 +1,4 @@
 import inspect
-import copy
 from typing import Any, Dict
 
 
@@ -13,7 +12,7 @@ def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
     signature = inspect.signature(callable)
     params = signature.parameters
 
-    new_kwargs = copy.copy(kwargs)
+    new_kwargs = {**kwargs}
 
     for key in params:
         if params[key].kind == inspect.Parameter.VAR_KEYWORD:
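The {**kwargs} expression above is a shallow dict copy, equivalent to copy.copy for a plain dict, which lets the copy import be dropped without changing behavior:

    kwargs = {"a": 1, "nested": {"b": 2}}
    new_kwargs = {**kwargs}
    new_kwargs["a"] = 10
    assert kwargs["a"] == 1                          # top-level keys are independent
    assert new_kwargs["nested"] is kwargs["nested"]  # nested objects are still shared
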
deltacat/utils/pyarrow.py CHANGED
@@ -294,6 +294,7 @@ def s3_partial_parquet_file_to_table(
         content_type=content_type,
         content_encoding=content_encoding,
         partial_file_download_params=partial_file_download_params,
+        pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
         **s3_client_kwargs,
     )
 
@@ -488,6 +489,10 @@ def table_size(table: pa.Table) -> int:
     return table.nbytes
 
 
+def parquet_file_size(table: papq.ParquetFile) -> int:
+    return table.metadata.serialized_size
+
+
 def table_to_file(
     table: pa.Table,
     base_path: str,
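On the new size helper above: a papq.ParquetFile keeps only its footer metadata in memory, and FileMetaData.serialized_size reports the byte size of that thrift-encoded footer, which presumably serves as a cheap size proxy for lazily loaded parquet tables. Illustrative usage (the path is a placeholder):

    import pyarrow.parquet as papq

    pf = papq.ParquetFile("some/table/file.parquet")  # placeholder path
    print(pf.metadata.serialized_size)                # size of the thrift footer, in bytes
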
{deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b15
+Version: 0.1.18b16
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team