deltacat 0.1.18b17__py3-none-any.whl → 0.1.18b19__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +3 -3
- deltacat/aws/constants.py +2 -1
- deltacat/compute/compactor/model/compact_partition_params.py +9 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -0
- deltacat/compute/compactor/model/delta_annotated.py +7 -0
- deltacat/compute/compactor_v2/compaction_session.py +6 -0
- deltacat/compute/compactor_v2/constants.py +7 -2
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -1
- deltacat/compute/compactor_v2/utils/content_type_params.py +1 -1
- deltacat/compute/compactor_v2/utils/task_options.py +22 -5
- deltacat/compute/metastats/stats.py +2 -1
- deltacat/io/memcached_object_store.py +8 -3
- deltacat/tests/utils/test_pyarrow.py +131 -44
- deltacat/tests/utils/test_resources.py +2 -0
- deltacat/utils/pyarrow.py +40 -77
- deltacat/utils/resources.py +25 -8
- {deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/METADATA +2 -2
- {deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/RECORD +22 -22
- {deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/clients.py
CHANGED
@@ -109,7 +109,7 @@ def block_until_instance_metadata_service_returns_success(
     url=INSTANCE_METADATA_SERVICE_IPV4_URI,
     retry_strategy=RetryIfRetryableHTTPStatusCode,
     wait_strategy=wait_fixed(2),  # wait 2 seconds before retrying,
-    stop_strategy=stop_after_delay(60 *
+    stop_strategy=stop_after_delay(60 * 30),  # stop trying after 30 minutes
 ) -> Optional[Response]:
     """Blocks until the instance metadata service returns a successful response.
 
@@ -151,10 +151,10 @@ def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
     boto3_session = _get_session_from_kwargs(kwargs)
 
     boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
+    kwargs = {"config": boto_config, **kwargs}
     return boto3_session.resource(
         name,
         region,
-        config=boto_config,
         **kwargs,
     )
 
@@ -169,10 +169,10 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
     boto_config = Config(
         retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
    )
+    kwargs = {"config": boto_config, **kwargs}
     return boto3_session.client(
         name,
         region,
-        config=boto_config,
         **kwargs,
     )
 
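Note: merging the default Config into kwargs before the boto3 call changes the override semantics — a caller-supplied "config" kwarg now replaces the adaptive-retry default, instead of colliding with an explicit config=boto_config argument. A minimal sketch of the dict-merge behavior this relies on (plain Python, no boto3 required):

    # Later keys win in a dict merge, so caller kwargs override defaults.
    defaults = {"config": "adaptive_retry_config"}
    caller_kwargs = {"config": "caller_config", "verify": False}
    merged = {**defaults, **caller_kwargs}
    assert merged["config"] == "caller_config"  # caller wins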
deltacat/aws/constants.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List
 
-from deltacat.utils.common import env_integer
+from deltacat.utils.common import env_integer, env_string
 
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 15)
 TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
+AWS_REGION = env_string("AWS_REGION", "us-east-1")
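Note: env_string is deltacat's helper; assuming it mirrors env_integer by reading an environment variable with a fallback, the new constant resolves as sketched below (the env_string body here is a hypothetical stand-in, not the library source):

    import os

    # Hypothetical stand-in for deltacat.utils.common.env_string.
    def env_string(name: str, default: str) -> str:
        return os.environ.get(name, default)

    AWS_REGION = env_string("AWS_REGION", "us-east-1")  # env value if set, else default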
deltacat/compute/compactor/model/compact_partition_params.py
CHANGED
@@ -90,6 +90,7 @@ class CompactPartitionParams(dict):
             "hash_group_count", result.hash_bucket_count
         )
         result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
+        result.ray_custom_resources = params.get("ray_custom_resources")
 
         if not importlib.util.find_spec("memray"):
             result.enable_profiler = False
@@ -288,6 +289,14 @@ class CompactPartitionParams(dict):
     def hash_group_count(self) -> int:
         return self["hash_group_count"]
 
+    @property
+    def ray_custom_resources(self) -> Dict:
+        return self["ray_custom_resources"]
+
+    @ray_custom_resources.setter
+    def ray_custom_resources(self, res) -> None:
+        self["ray_custom_resources"] = res
+
     @hash_group_count.setter
     def hash_group_count(self, count: int) -> None:
         self["hash_group_count"] = count
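Note: the new property/setter pair follows the dict-backed accessor pattern used throughout these model classes. A self-contained sketch of the same pattern (the Params class here is illustrative, not the real CompactPartitionParams):

    from typing import Dict, Optional

    class Params(dict):
        # Dict-backed property: attribute access reads/writes a dict key.
        @property
        def ray_custom_resources(self) -> Optional[Dict]:
            return self.get("ray_custom_resources")

        @ray_custom_resources.setter
        def ray_custom_resources(self, res) -> None:
            self["ray_custom_resources"] = res

    p = Params()
    p.ray_custom_resources = {"memory_heavy_node": 1}
    assert p["ray_custom_resources"] == {"memory_heavy_node": 1}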
deltacat/compute/compactor/model/compaction_session_audit_info.py
CHANGED
@@ -420,6 +420,21 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("usedCPUSeconds")
 
+    @property
+    def used_memory_gb_seconds(self) -> float:
+        """
+        The used memory in the cluster weighted over time. This
+        determines opportunities for better memory estimation.
+        """
+        return self.get("usedMemoryGBSeconds")
+
+    @property
+    def total_memory_gb_seconds(self) -> float:
+        """
+        Total memory in the cluster weighted over time in GB.
+        """
+        return self.get("totalMemoryGBSeconds")
+
     @property
     def pyarrow_version(self) -> str:
         """
@@ -743,6 +758,14 @@ class CompactionSessionAuditInfo(dict):
         self["usedCPUSeconds"] = value
         return self
 
+    def set_used_memory_gb_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["usedMemoryGBSeconds"] = value
+        return self
+
+    def set_total_memory_gb_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["totalMemoryGBSeconds"] = value
+        return self
+
     def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
         self["pyarrowVersion"] = value
         return self
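Note: because each setter returns self, audit fields can be recorded fluently in one pass, which is how the compaction session uses them below. An isolated sketch of the builder-style pattern (AuditInfo is a stand-in class):

    class AuditInfo(dict):
        # Setters return self so calls can be chained.
        def set_used_memory_gb_seconds(self, value: float) -> "AuditInfo":
            self["usedMemoryGBSeconds"] = value
            return self

        def set_total_memory_gb_seconds(self, value: float) -> "AuditInfo":
            self["totalMemoryGBSeconds"] = value
            return self

    audit = AuditInfo().set_used_memory_gb_seconds(120.0).set_total_memory_gb_seconds(480.0)
    assert audit == {"usedMemoryGBSeconds": 120.0, "totalMemoryGBSeconds": 480.0}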
deltacat/compute/compactor/model/delta_annotated.py
CHANGED
@@ -286,8 +286,15 @@ class DeltaAnnotated(Delta):
 
                 result.append(new_da)
             else:
+                logger.info(
+                    f"Split was not performed on delta with locator: {delta_annotated.locator} "
+                    "as partial parquet params was not found."
+                )
                 return [delta_annotated]
 
+    if result:
+        return result
+
     logger.info(
         f"Split was not performed on the delta with locator: {delta_annotated.locator}"
     )
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -217,6 +217,7 @@ def _execute_compaction(
         previous_inflation=params.previous_inflation,
         average_record_size_bytes=params.average_record_size_bytes,
         primary_keys=params.primary_keys,
+        ray_custom_resources=params.ray_custom_resources,
     )
 
     hb_start = time.monotonic()
@@ -337,6 +338,7 @@ def _execute_compaction(
         primary_keys=params.primary_keys,
         deltacat_storage=params.deltacat_storage,
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        ray_custom_resources=params.ray_custom_resources,
     )
 
     def merge_input_provider(index, item):
@@ -479,6 +481,10 @@ def _execute_compaction(
     if cluster_util:
         compaction_audit.set_total_cpu_seconds(cluster_util.total_vcpu_seconds)
         compaction_audit.set_used_cpu_seconds(cluster_util.used_vcpu_seconds)
+        compaction_audit.set_used_memory_gb_seconds(cluster_util.used_memory_gb_seconds)
+        compaction_audit.set_total_memory_gb_seconds(
+            cluster_util.total_memory_gb_seconds
+        )
 
     s3_utils.upload(
         compaction_audit.audit_url,
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -23,10 +23,10 @@ AVERAGE_RECORD_SIZE_BYTES = 1000
 # r5.8xlarge EC2 instances.
 TASK_MAX_PARALLELISM = 5367
 
-# The percentage of memory that needs to be
+# The percentage of memory that needs to be allocated
 # as buffer. This value will ensure the job doesn't run out
 # of memory by considering buffer for uncertainities.
-TOTAL_MEMORY_BUFFER_PERCENTAGE =
+TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
@@ -35,3 +35,8 @@ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
+
+# PARQUET to PYARROW inflation multiplier
+# This is the observed upper bound inflation for parquet
+# size in metadata to pyarrow table size.
+PARQUET_TO_PYARROW_INFLATION = 4
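Note: these two constants combine in the task-sizing math patched in utils/task_options.py below. A worked example, assuming a 1 GB in-memory size reported by parquet metadata:

    PARQUET_TO_PYARROW_INFLATION = 4
    TOTAL_MEMORY_BUFFER_PERCENTAGE = 30

    in_memory_size_bytes = 1_000_000_000  # assumed size from parquet metadata
    estimated_bytes = in_memory_size_bytes * PARQUET_TO_PYARROW_INFLATION  # 4.0e9
    total_memory = estimated_bytes * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
    print(total_memory)  # 5.2e9 bytes requested for the task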
deltacat/compute/compactor_v2/steps/hash_bucket.py
CHANGED
@@ -109,7 +109,10 @@ def _group_file_records_by_pk_hash_bucket(
     if delta_file_envelopes is None:
         return None, 0, 0
 
-    logger.info(
+    logger.info(
+        f"Read all delta file envelopes: {len(delta_file_envelopes)} "
+        f"and total_size_bytes={total_size_bytes} and records={total_record_count}"
+    )
 
     # group the data by primary key hash value
     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED
@@ -11,7 +11,10 @@ from deltacat.compute.compactor.model.round_completion_info import RoundCompleti
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     hash_group_index_to_hash_bucket_indices,
 )
-from deltacat.compute.compactor_v2.constants import
+from deltacat.compute.compactor_v2.constants import (
+    TOTAL_MEMORY_BUFFER_PERCENTAGE,
+    PARQUET_TO_PYARROW_INFLATION,
+)
 
 
 def _get_parquet_type_params_if_exist(
@@ -45,7 +48,19 @@ def _calculate_parquet_column_size(
             "Columns not found in the parquet data as "
             f"{columns_found} != {len(columns)}"
         )
-    return column_size
+    return column_size * PARQUET_TO_PYARROW_INFLATION
+
+
+def _get_task_options(
+    cpu: float, memory: float, ray_custom_resources: Optional[Dict] = None
+) -> Dict:
+
+    task_opts = {"num_cpus": cpu, "memory": memory}
+
+    if ray_custom_resources:
+        task_opts["resources"] = ray_custom_resources
+
+    return task_opts
 
 
 def estimate_manifest_entry_size_bytes(
@@ -57,7 +72,7 @@ def estimate_manifest_entry_size_bytes(
     type_params = _get_parquet_type_params_if_exist(entry=entry)
 
     if type_params:
-        return type_params.in_memory_size_bytes
+        return type_params.in_memory_size_bytes * PARQUET_TO_PYARROW_INFLATION
 
     return entry.meta.content_length * previous_inflation
 
@@ -103,6 +118,7 @@ def hash_bucket_resource_options_provider(
     previous_inflation: float,
     average_record_size_bytes: float,
     primary_keys: List[str] = None,
+    ray_custom_resources: Optional[Dict] = None,
     **kwargs,
 ) -> Dict:
     size_bytes = 0.0
@@ -141,7 +157,7 @@ def hash_bucket_resource_options_provider(
     # Consider buffer
     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
 
-    return
+    return _get_task_options(0.01, total_memory, ray_custom_resources)
 
 
 def merge_resource_options_provider(
@@ -152,6 +168,7 @@ def merge_resource_options_provider(
     hash_group_num_rows: Dict[int, int],
     round_completion_info: Optional[RoundCompletionInfo] = None,
     compacted_delta_manifest: Optional[Manifest] = None,
+    ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict] = {},
@@ -218,4 +235,4 @@ def merge_resource_options_provider(
 
     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
 
-    return
+    return _get_task_options(0.01, total_memory, ray_custom_resources)
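Note: the dict built by _get_task_options is shaped for Ray's per-task .options() call; custom resources only schedule onto nodes that advertise them. A runnable sketch (the task body is a placeholder, and custom resources are omitted so it schedules anywhere):

    import ray

    ray.init(ignore_reinit_error=True)

    @ray.remote
    def work() -> str:
        return "done"  # placeholder task body

    # Same shape as _get_task_options(0.01, 256 * 1024 * 1024) without custom resources.
    opts = {"num_cpus": 0.01, "memory": 256 * 1024 * 1024}
    print(ray.get(work.options(**opts).remote()))  # "done"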
deltacat/compute/metastats/stats.py
CHANGED
@@ -8,6 +8,7 @@ from ray.types import ObjectRef
 from deltacat import logs
 from deltacat.aws import s3u as s3_utils
 from deltacat.aws.clients import client_cache
+from deltacat.aws.constants import AWS_REGION
 from deltacat.compute.compactor import DeltaAnnotated
 from deltacat.compute.metastats.utils.io import (
     cache_inflation_rate_data_for_delta_stats_ready,
@@ -94,7 +95,7 @@ def start_stats_collection(
 
 
 def _get_account_id() -> str:
-    client = client_cache("sts",
+    client = client_cache("sts", region_name=AWS_REGION)
     account_id = client.get_caller_identity()["Account"]
     return account_id
 
deltacat/io/memcached_object_store.py
CHANGED
@@ -29,6 +29,7 @@ class MemcachedObjectStore(IObjectStore):
         self.port = port
         self.storage_node_ips = storage_node_ips
         self.hasher = None
+        logger.info(f"The storage node IPs: {self.storage_node_ips}")
         super().__init__()
 
     def initialize_hasher(self):
@@ -129,9 +130,13 @@ class MemcachedObjectStore(IObjectStore):
         base_client = Client((ip_address, self.port))
         client = RetryingClient(
             base_client,
-            attempts=
-            retry_delay=
-            retry_for=[
+            attempts=15,
+            retry_delay=1,
+            retry_for=[
+                MemcacheUnexpectedCloseError,
+                ConnectionResetError,
+                BrokenPipeError,
+            ],
         )
 
         self.client_cache[ip_address] = client
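Note: the retry settings are now pinned — 15 attempts, 1 second apart, retrying connection-level failures as well as unexpected server closes. A standalone sketch of the same wiring with pymemcache ("localhost" is a placeholder endpoint):

    from pymemcache.client.base import Client
    from pymemcache.client.retrying import RetryingClient
    from pymemcache.exceptions import MemcacheUnexpectedCloseError

    base_client = Client(("localhost", 11211))  # placeholder memcached endpoint
    client = RetryingClient(
        base_client,
        attempts=15,
        retry_delay=1,
        retry_for=[MemcacheUnexpectedCloseError, ConnectionResetError, BrokenPipeError],
    )
    client.set("key", "value")  # retried on any of the listed errors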
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -1,10 +1,10 @@
 from unittest import TestCase
 from deltacat.utils.pyarrow import (
-    s3_parquet_file_to_table,
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
+    s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
 )
@@ -16,49 +16,8 @@ import pyarrow as pa
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
-
-
-class TestS3ParquetFileToTable(TestCase):
-    def test_s3_parquet_file_to_table_sanity(self):
-
-        result = s3_parquet_file_to_table(
-            PARQUET_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.IDENTITY.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-        )
-
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
-        schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-    def test_s3_parquet_file_to_table_when_schema_overridden(self):
-
-        schema = pa.schema(
-            [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
-        )
-
-        pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
-
-        result = s3_parquet_file_to_table(
-            PARQUET_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.IDENTITY.value,
-            ["n_legs", "animal"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 2)
-
-        result_schema = result.schema
-        for index, field in enumerate(result_schema):
-            self.assertEqual(field.name, schema.field(index).name)
-
-        self.assertEqual(result.schema.field(1).type, "string")
+GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
+BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
 
 class TestS3PartialParquetFileToTable(TestCase):
@@ -343,3 +302,131 @@ class TestReadCSV(TestCase):
                 EMPTY_UTSV_PATH, **{**kwargs, RAISE_ON_EMPTY_CSV_KWARG: True}
             ),
         )
+
+
+class TestS3FileToTable(TestCase):
+    def test_s3_file_to_table_identity_sanity(self):
+
+        schema = pa.schema(
+            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
+        )
+
+        result = s3_file_to_table(
+            NON_EMPTY_VALID_UTSV_PATH,
+            ContentType.UNESCAPED_TSV.value,
+            ContentEncoding.IDENTITY.value,
+            ["is_active", "ship_datetime_utc"],
+            None,
+            pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+                schema=schema
+            ),
+        )
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        for index, field in enumerate(result_schema):
+            self.assertEqual(field.name, schema.field(index).name)
+
+        self.assertEqual(result.schema.field(0).type, "string")
+
+    def test_s3_file_to_table_gzip_compressed_sanity(self):
+
+        schema = pa.schema(
+            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
+        )
+
+        result = s3_file_to_table(
+            GZIP_COMPRESSED_FILE_UTSV_PATH,
+            ContentType.UNESCAPED_TSV.value,
+            ContentEncoding.GZIP.value,
+            ["is_active", "ship_datetime_utc"],
+            None,
+            pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+                schema=schema
+            ),
+        )
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        for index, field in enumerate(result_schema):
+            self.assertEqual(field.name, schema.field(index).name)
+
+        self.assertEqual(result.schema.field(0).type, "string")
+
+    def test_s3_file_to_table_bz2_compressed_sanity(self):
+
+        schema = pa.schema(
+            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
+        )
+
+        result = s3_file_to_table(
+            BZ2_COMPRESSED_FILE_UTSV_PATH,
+            ContentType.UNESCAPED_TSV.value,
+            ContentEncoding.BZIP2.value,
+            ["is_active", "ship_datetime_utc"],
+            None,
+            pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+                schema=schema
+            ),
+        )
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        for index, field in enumerate(result_schema):
+            self.assertEqual(field.name, schema.field(index).name)
+
+        self.assertEqual(result.schema.field(0).type, "string")
+
+    def test_s3_file_to_table_when_parquet_sanity(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
+
+    def test_s3_file_to_table_when_parquet_schema_overridden(self):
+
+        schema = pa.schema(
+            [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
+        )
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "schema": schema,
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            ["n_legs", "animal"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 2)
+
+        result_schema = result.schema
+        for index, field in enumerate(result_schema):
+            self.assertEqual(field.name, schema.field(index).name)
+
+        self.assertEqual(result.schema.field(1).type, "string")
deltacat/utils/pyarrow.py
CHANGED
@@ -41,6 +41,7 @@ from deltacat.utils.arguments import (
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
+READER_TYPE_KWARG = "reader_type"
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -173,9 +174,9 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
 
 # TODO (pdames): add deflate and snappy
 ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
-    ContentEncoding.GZIP.value: partial(gzip.
-    ContentEncoding.BZIP2.value: partial(bz2.
-    ContentEncoding.IDENTITY.value: lambda
+    ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
+    ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
+    ContentEncoding.IDENTITY.value: lambda s3_file: s3_file,
 }
 
@@ -385,47 +386,6 @@ def s3_partial_parquet_file_to_table(
     return table
 
 
-def s3_parquet_file_to_table(
-    s3_url: str,
-    content_type: str,
-    content_encoding: str,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-    **s3_client_kwargs,
-) -> pa.Table:
-
-    logger.debug(
-        f"Reading to Parquet table using read_table for {content_type} "
-        f"and encoding: {content_encoding}"
-    )
-
-    if s3_client_kwargs is None:
-        s3_client_kwargs = {}
-
-    kwargs = {}
-
-    if s3_url.startswith("s3://"):
-        s3_file_system = create_s3_file_system(s3_client_kwargs)
-        kwargs["filesystem"] = s3_file_system
-
-    _add_column_kwargs(
-        content_type=content_type,
-        column_names=column_names,
-        include_columns=include_columns,
-        kwargs=kwargs,
-    )
-
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
-    table, latency = timed_invocation(papq.read_table, s3_url, **kwargs)
-
-    logger.debug(f"Successfully read the table from url={s3_url} in {latency}s")
-    return table
-
-
 def s3_file_to_table(
     s3_url: str,
     content_type: str,
@@ -437,13 +397,17 @@ def s3_file_to_table(
     **s3_client_kwargs,
 ) -> pa.Table:
 
-    from deltacat.aws import s3u as s3_utils
-
     logger.debug(
         f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
         f"Encoding: {content_encoding}"
     )
 
+    kwargs = content_type_to_reader_kwargs(content_type)
+    _add_column_kwargs(content_type, column_names, include_columns, kwargs)
+
+    if pa_read_func_kwargs_provider is not None:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
     if (
         content_type == ContentType.PARQUET.value
         and content_encoding == ContentEncoding.IDENTITY.value
@@ -452,47 +416,46 @@ def s3_file_to_table(
             f"Performing read using parquet reader for encoding={content_encoding} "
             f"and content_type={content_type}"
         )
-        kwargs = {}
-        if pa_read_func_kwargs_provider is not None:
-            kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
-
+        parquet_reader_func = None
+        if kwargs.get(READER_TYPE_KWARG, "daft") == "daft":
             parquet_reader_func = daft_s3_file_to_table
         elif partial_file_download_params and isinstance(
             partial_file_download_params, PartialParquetParameters
         ):
             parquet_reader_func = s3_partial_parquet_file_to_table
-        else:
-            parquet_reader_func = s3_parquet_file_to_table
-
-        return parquet_reader_func(
-            s3_url=s3_url,
-            content_type=content_type,
-            content_encoding=content_encoding,
-            column_names=column_names,
-            include_columns=include_columns,
-            pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
-            partial_file_download_params=partial_file_download_params,
-            **s3_client_kwargs,
-        )
 
-
-
-
-
-
+        if parquet_reader_func is not None:
+            return parquet_reader_func(
+                s3_url=s3_url,
+                content_type=content_type,
+                content_encoding=content_encoding,
+                column_names=column_names,
+                include_columns=include_columns,
+                pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+                partial_file_download_params=partial_file_download_params,
+                **s3_client_kwargs,
+            )
 
-
-
-    _add_column_kwargs(content_type, column_names, include_columns, kwargs)
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
-
-
+    filesystem = io
+    if s3_url.startswith("s3://"):
+        filesystem = create_s3_file_system(s3_client_kwargs)
 
-    logger.debug(f"
-
-
-
+    logger.debug(f"Read S3 object from {s3_url} using filesystem: {filesystem}")
+    input_file_init = ENCODING_TO_FILE_INIT[content_encoding]
+    pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
+
+    with filesystem.open(s3_url, "rb") as s3_file, input_file_init(
+        s3_file
+    ) as input_file:
+        args = [input_file]
+        logger.debug(f"Reading {s3_url} via {pa_read_func} with kwargs: {kwargs}")
+        table, latency = timed_invocation(pa_read_func, *args, **kwargs)
+        logger.debug(f"Time to read {s3_url} into PyArrow table: {latency}s")
+        return table
 
 
 def s3_file_to_parquet(
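Note: after this restructuring, s3_file_to_table handles every content type itself: identity-encoded parquet dispatches to the daft reader by default (or to the partial-parquet reader), while a "reader_type": "pyarrow" kwarg from a kwargs provider opts into the generic pyarrow path, which now also decompresses GZIP/BZIP2 via ENCODING_TO_FILE_INIT. A usage sketch mirroring the new tests above (the S3 URL is a placeholder):

    from deltacat.types.media import ContentEncoding, ContentType
    from deltacat.utils.pyarrow import s3_file_to_table

    # Returning reader_type="pyarrow" forces the pyarrow reader over daft.
    provider = lambda content_type, kwargs: {"reader_type": "pyarrow", **kwargs}

    table = s3_file_to_table(
        "s3://my-bucket/my-file.parquet",  # placeholder URL
        ContentType.PARQUET.value,
        ContentEncoding.IDENTITY.value,
        column_names=["n_legs", "animal"],
        pa_read_func_kwargs_provider=provider,
    )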
deltacat/utils/resources.py
CHANGED
@@ -15,6 +15,7 @@ from resource import getrusage, RUSAGE_SELF
 import platform
 import psutil
 import schedule
+from deltacat.constants import BYTES_PER_GIBIBYTE
 
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -73,9 +74,11 @@ class ClusterUtilizationOverTimeRange(AbstractContextManager):
     def __init__(self) -> None:
         self.total_vcpu_seconds = 0.0
         self.used_vcpu_seconds = 0.0
+        self.total_memory_gb_seconds = 0.0
+        self.used_memory_gb_seconds = 0.0
 
     def __enter__(self) -> Any:
-        schedule.every().second.do(self.
+        schedule.every().second.do(self._update_resources)
         self.stop_run_schedules = self._run_schedule()
         return super().__enter__()
 
@@ -94,7 +97,7 @@ class ClusterUtilizationOverTimeRange(AbstractContextManager):
 
     # It is not truely parallel(due to GIL Ref: https://wiki.python.org/moin/GlobalInterpreterLock)
     # even if we are using threading library. However, it averages out and gives a very good approximation.
-    def
+    def _update_resources(self):
         cluster_resources = ray.cluster_resources()
         available_resources = ray.available_resources()
         if "CPU" not in cluster_resources:
@@ -104,14 +107,28 @@ class ClusterUtilizationOverTimeRange(AbstractContextManager):
         self.used_vcpu_seconds = self.used_vcpu_seconds + float(
             str(cluster_resources["CPU"] - available_resources["CPU"])
         )
-
-
-        )
-
-
-
+
+        self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+            str(cluster_resources["CPU"])
+        )
+
+        if "memory" not in cluster_resources:
+            return
+
+        if "memory" in available_resources:
+            self.used_memory_gb_seconds = (
+                self.used_memory_gb_seconds
+                + float(
+                    str(cluster_resources["memory"] - available_resources["memory"])
+                )
+                / BYTES_PER_GIBIBYTE
             )
 
+        self.total_memory_gb_seconds = (
+            self.total_memory_gb_seconds
+            + float(str(cluster_resources["memory"])) / BYTES_PER_GIBIBYTE
+        )
+
     def _run_schedule(self, interval: Optional[float] = 1.0):
         cease_continuous_run = threading.Event()
 
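Note: ClusterUtilizationOverTimeRange samples ray.cluster_resources() once per second on a background schedule, so the new GB-second counters approximate memory use integrated over the run. Usage sketch (assumes ray.init() has already run; the workload call is a placeholder):

    from deltacat.utils.resources import ClusterUtilizationOverTimeRange

    with ClusterUtilizationOverTimeRange() as cluster_util:
        run_compaction_workload()  # placeholder for the measured work

    # Counters accumulate one sample per second while the block runs.
    print(cluster_util.used_memory_gb_seconds)
    print(cluster_util.total_memory_gb_seconds)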
{deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.
+Version: 0.1.18b19
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -26,7 +26,7 @@ Requires-Dist: tenacity ==8.1.0
 Requires-Dist: typing-extensions ==4.4.0
 Requires-Dist: pymemcache ==4.0.0
 Requires-Dist: redis ==4.6.0
-Requires-Dist: getdaft ==0.1.
+Requires-Dist: getdaft ==0.1.16
 Requires-Dist: schedule ==1.2.0
 
 # DeltaCAT
{deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=0iDCTG5vkP_WA3KAWsoKCN1p_8ZONakFWjeoGeVspDU,1781
 deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=xqZf8CwysNYP2d39pf27OnXGStPREgBgIM-e2Tts-TI,199
 deltacat/logs.py,sha256=9XWuTBoWhhAF9rAL6t9veXmnAlJHsaqk0lTxteVPqyQ,5674
 deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/aws/clients.py,sha256=
-deltacat/aws/constants.py,sha256=
+deltacat/aws/clients.py,sha256=1_h6SLlTWGuy9ZaUEtv96F34zfS4e8Rs94MFhLjSd7c,6376
+deltacat/aws/constants.py,sha256=01jrMFz3ZduGA0c7vz1wCHR7gBoNCMXXLi4JFYzZA9M,267
 deltacat/aws/s3u.py,sha256=mdJrX9z5O8kh00jUL0w8CYBxKAemVYs26sRDzwSonfg,18390
 deltacat/aws/redshift/__init__.py,sha256=fjuv3jWdPE8IgF4uSrL0YEqV3XUfqDULX3xV27ICceo,266
 deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -23,10 +23,10 @@ deltacat/compute/compactor/__init__.py,sha256=ivpOPve1yKi3Vz3tVgp-eeFMNEeUSf-dlR
 deltacat/compute/compactor/compaction_session.py,sha256=aHCkhjcJ3kgRcDDJ6snSgmPts7nLvtm_oGTqoxA3-68,27408
 deltacat/compute/compactor/repartition_session.py,sha256=f5BTTGNv365qSuTioL7QUuVm-px_l8-zz-OC_p7gXt4,7240
 deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor/model/compact_partition_params.py,sha256=
-deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=
+deltacat/compute/compactor/model/compact_partition_params.py,sha256=ASPbmI9GMNG7Ho6XEJHyGDXbrHe-ytRvhjq_Bys2Oec,14283
+deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=GxAbBxEPJvCJoWqCJlojldYRQO635P915wy9XT9jgKE,31034
 deltacat/compute/compactor/model/dedupe_result.py,sha256=1OCV944qJdLQ_-8scisVKl45ej1eRv9OV539QYZtQ-U,292
-deltacat/compute/compactor/model/delta_annotated.py,sha256=
+deltacat/compute/compactor/model/delta_annotated.py,sha256=NERB9rOtYg-xzBwvqGJ7_hBOzBC7g6X5M9-Cq5pbdH8,12258
 deltacat/compute/compactor/model/delta_file_envelope.py,sha256=et1KXJLwheEpzvy8vNjlYcgGavvwaNElZZYaCu7kyVA,2821
 deltacat/compute/compactor/model/delta_file_locator.py,sha256=AmhPGPDsmahVhp91rohJMx4ByumcIY5feqRLZTrNu4s,1905
 deltacat/compute/compactor/model/hash_bucket_result.py,sha256=71qGmaT1Mks-r3-aatjNbn2x3yWIgT8RmV0bRWe6pdA,275
@@ -47,25 +47,25 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=DmZfHeAXlQn0DDd
 deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/compaction_session.py,sha256=
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/compaction_session.py,sha256=rxIyLaOuoGEpZk-UdtGAPurJA7oB9X-Vkvr7hgWTlV0,19234
+deltacat/compute/compactor_v2/constants.py,sha256=skNkIOkvyfGm4z086ekln7niMwHmfMErhV9H0k8b1cc,1471
 deltacat/compute/compactor_v2/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/model/hash_bucket_input.py,sha256=pgE2o8Z9-Dvs75C15LAkmfuJFFi5pRIuuxA9GGyDlLM,2631
 deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcLKn3kGWzAX4s4BTR2vYyPUB-wAEOc,309
 deltacat/compute/compactor_v2/model/merge_input.py,sha256=A-_Oq54sx1vrT-Ewv2_yKARdIh928yJvEuheCkw5tvQ,5049
 deltacat/compute/compactor_v2/model/merge_result.py,sha256=L53i9iL_XpzqBr7HETixD5v5qfLvitkGcjoML_hHfcs,368
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=
+deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=NR-IDva6iB2YeNgxim_WsuZfEk5ooV8jAwzDJjdrsDE,7375
 deltacat/compute/compactor_v2/steps/merge.py,sha256=hgQiY2ui49HN-7ByIQlXVUCRbyrG7Jr61kohyGh6abY,17258
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=
+deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=rNKZisxGrLQOkwX8eHUQiFoTR1V-E66pMqWigtrs618,2156
 deltacat/compute/compactor_v2/utils/dedupe.py,sha256=A1xs5CU419h0nKv0B7R4tDkdgYAUIFQB_DWryRhpL98,1710
 deltacat/compute/compactor_v2/utils/io.py,sha256=4KV13VKwEtIzkwPJLJmEnp1dMOKHSxkEOQNQVbYrcwY,5177
 deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=NNF-h4zKRegVluAtXSDW4YRdOd4xJ6z_6uDaxfJyBiw,11122
-deltacat/compute/compactor_v2/utils/task_options.py,sha256=
+deltacat/compute/compactor_v2/utils/task_options.py,sha256=jjN-NNkMQetbcFgJMhHqVBqNdWfXFrRglMaFYjTfeBo,7987
 deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/metastats/meta_stats.py,sha256=78hN3aN5wLHUFJsZXuv2JLeqA35HZ8mLUWJDMslMj5Q,18731
-deltacat/compute/metastats/stats.py,sha256=
+deltacat/compute/metastats/stats.py,sha256=8iUiSXOAjqiEeNP5RIb5gvhykBgpNHD5IKkB8zsPR0E,7363
 deltacat/compute/metastats/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/metastats/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/metastats/model/partition_stats_dict.py,sha256=FbfoOxmTZfjRT7iHwc_96gHmB_r6iUvVM9BoTldD5mY,1123
@@ -91,7 +91,7 @@ deltacat/compute/stats/utils/manifest_stats_file.py,sha256=PtqW5Zc5e09HcfiAgvoZH
 deltacat/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/io/dataset.py,sha256=8w9sPVDpGnjjGVDWB39YSKWxq4zRv9VEfDtj7PYwjqM,3755
 deltacat/io/file_object_store.py,sha256=HCFeXu9cWXPXVk54MHel_nw3-wIuzhMt2RI6jKzjRYM,1346
-deltacat/io/memcached_object_store.py,sha256=
+deltacat/io/memcached_object_store.py,sha256=eA0Ggk6BedFcyT_lbgScjRj2TSB2p45YwAs0B-qJEt0,5104
 deltacat/io/object_store.py,sha256=GX4pK-LY92s3uXRGcj8YsG2FFoiKfcJr2USIVz1ruGg,1380
 deltacat/io/ray_plasma_object_store.py,sha256=pupw7ulZY_EV5dERJDCCW_y_hzVx3Hl_uAvpQTNIh-E,705
 deltacat/io/read_api.py,sha256=BhkjL3xjY-fsa62AA9Yv20_88uTskn4_Bv2W6VmMXVA,7023
@@ -143,9 +143,9 @@ deltacat/tests/test_utils/pyarrow.py,sha256=EZk2Mtqiiu7Z79Lqm-hyHWbH6c-lbYnpvCn3
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/utils/test_daft.py,sha256=dfg4PYs6W4buBwj0FakTF2i7uFF6G4nj_48Dc8R11HQ,2852
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=QspOrQRc6tLM52-taHYgw5v_49qKgvdgW7E4eNSv5Mk,15708
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
-deltacat/tests/utils/test_resources.py,sha256=
+deltacat/tests/utils/test_resources.py,sha256=NMiJl9wlKNnK-edBaftN2CWxDhFmR2AGsseNsg8n-zg,1856
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/types/media.py,sha256=RALwafQ0SwMyPUIcENhURk7Sor_2CIfEMztvFUnvZFQ,2227
@@ -160,8 +160,8 @@ deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
 deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=S80CwD1eEK47lQNr0xTmF9kq092-z6lTTmOOBv8cW_o,11723
-deltacat/utils/pyarrow.py,sha256=
-deltacat/utils/resources.py,sha256=
+deltacat/utils/pyarrow.py,sha256=Bm_B-gtI6enhlgDuzpSiaNUc20p1rHS-3Y6OdHySl0E,26352
+deltacat/utils/resources.py,sha256=2nrYzDm6BPedrpsWM_4NxCZZ4cubVYitO1Myev0B5W0,6044
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/utils/ray_utils/collections.py,sha256=hj20s4D2RF2jZETU_44r6mFbsczA0JI_I_4kWKTmqes,1951
@@ -169,8 +169,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=MlpOHlKgJKSXzLsSR8mg4V_dWSVP_udrl
 deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
-deltacat-0.1.
-deltacat-0.1.
-deltacat-0.1.
-deltacat-0.1.
-deltacat-0.1.
+deltacat-0.1.18b19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-0.1.18b19.dist-info/METADATA,sha256=117_kv8NZ-J8NXxHG2F_Msgm_XvyjUJ_f2oEXJZ2PbU,1740
+deltacat-0.1.18b19.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+deltacat-0.1.18b19.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-0.1.18b19.dist-info/RECORD,,
{deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/LICENSE
File without changes
{deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/WHEEL
File without changes
{deltacat-0.1.18b17.dist-info → deltacat-0.1.18b19.dist-info}/top_level.txt
File without changes