deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/utils/pyarrow.py
CHANGED
@@ -7,6 +7,8 @@ import io
 import logging
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional
+from pyarrow.parquet import ParquetFile
+from deltacat.exceptions import ValidationError

 import pyarrow as pa
 from fsspec import AbstractFileSystem
@@ -15,6 +17,7 @@ from pyarrow import feather as paf
 from pyarrow import json as pajson
 from pyarrow import parquet as papq
 from ray.data.datasource import BlockWritePathProvider
+from deltacat.utils.s3fs import create_s3_file_system

 from deltacat import logs
 from deltacat.types.media import (
@@ -23,12 +26,17 @@ from deltacat.types.media import (
     ContentEncoding,
     ContentType,
 )
+from deltacat.types.partial_download import (
+    PartialFileDownloadParams,
+    PartialParquetParameters,
+)
 from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
+from deltacat.utils.daft import daft_s3_file_to_table
+from deltacat.utils.arguments import sanitize_kwargs_to_callable

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

-
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pacsv.read_csv,
     ContentType.TSV.value: pacsv.read_csv,
@@ -170,6 +178,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         self,
         schema: Optional[pa.Schema] = None,
         pq_coerce_int96_timestamp_unit: Optional[str] = None,
+        parquet_reader_type: Optional[str] = None,
     ):
         """

@@ -182,6 +191,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         """
         self.schema = schema
         self.pq_coerce_int96_timestamp_unit = pq_coerce_int96_timestamp_unit
+        self.parquet_reader_type = parquet_reader_type

     def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
@@ -201,6 +211,11 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
                 "coerce_int96_timestamp_unit"
             ] = self.pq_coerce_int96_timestamp_unit

+        if self.parquet_reader_type:
+            kwargs["reader_type"] = self.parquet_reader_type
+        else:
+            kwargs["reader_type"] = "daft"
+
         return kwargs


@@ -237,6 +252,118 @@ def _add_column_kwargs(
     )


+def _get_compatible_target_schema(
+    table_schema: pa.Schema, input_schema: pa.Schema
+) -> pa.Schema:
+    target_schema_fields = []
+
+    for field in table_schema:
+        index = input_schema.get_field_index(field.name)
+
+        if index != -1:
+            target_field = input_schema.field(index)
+            target_schema_fields.append(target_field)
+        else:
+            target_schema_fields.append(field)
+
+    target_schema = pa.schema(target_schema_fields, metadata=table_schema.metadata)
+
+    return target_schema
+
+
+def s3_partial_parquet_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialParquetParameters] = None,
+    **s3_client_kwargs,
+) -> pa.Table:
+
+    assert (
+        partial_file_download_params is not None
+    ), "Partial parquet params must not be None"
+    assert (
+        partial_file_download_params.row_groups_to_download is not None
+    ), "No row groups to download"
+
+    pq_file = s3_file_to_parquet(
+        s3_url=s3_url,
+        content_type=content_type,
+        content_encoding=content_encoding,
+        partial_file_download_params=partial_file_download_params,
+        **s3_client_kwargs,
+    )
+
+    table, latency = timed_invocation(
+        pq_file.read_row_groups,
+        partial_file_download_params.row_groups_to_download,
+        columns=include_columns or column_names,
+    )
+
+    logger.debug(f"Successfully read from s3_url={s3_url} in {latency}s")
+
+    kwargs = {}
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    # Note: ordering is not guaranteed.
+    if kwargs.get("schema") is not None:
+        input_schema = kwargs.get("schema")
+        table_schema = table.schema
+
+        target_schema = _get_compatible_target_schema(table_schema, input_schema)
+        casted_table = table.cast(target_schema)
+
+        return casted_table
+
+    return table
+
+
+def s3_parquet_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+) -> pa.Table:
+
+    logger.debug(
+        f"Reading to Parquet table using read_table for {content_type} "
+        f"and encoding: {content_encoding}"
+    )
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+
+    if s3_url.startswith("s3://"):
+        s3_file_system = create_s3_file_system(s3_client_kwargs)
+        kwargs["filesystem"] = s3_file_system
+
+    _add_column_kwargs(
+        content_type=content_type,
+        column_names=column_names,
+        include_columns=include_columns,
+        kwargs=kwargs,
+    )
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    table, latency = timed_invocation(papq.read_table, s3_url, **kwargs)
+
+    logger.debug(f"Successfully read the table from url={s3_url} in {latency}s")
+    return table
+
+
 def s3_file_to_table(
     s3_url: str,
     content_type: str,
@@ -244,6 +371,7 @@ def s3_file_to_table(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> pa.Table:

@@ -253,6 +381,39 @@ def s3_file_to_table(
         f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
         f"Encoding: {content_encoding}"
     )
+
+    if (
+        content_type == ContentType.PARQUET.value
+        and content_encoding == ContentEncoding.IDENTITY.value
+    ):
+        logger.debug(
+            f"Performing read using parquet reader for encoding={content_encoding} "
+            f"and content_type={content_type}"
+        )
+        kwargs = {}
+        if pa_read_func_kwargs_provider is not None:
+            kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+        if kwargs.get("reader_type", "daft") == "daft":
+            parquet_reader_func = daft_s3_file_to_table
+        elif partial_file_download_params and isinstance(
+            partial_file_download_params, PartialParquetParameters
+        ):
+            parquet_reader_func = s3_partial_parquet_file_to_table
+        else:
+            parquet_reader_func = s3_parquet_file_to_table
+
+        return parquet_reader_func(
+            s3_url=s3_url,
+            content_type=content_type,
+            content_encoding=content_encoding,
+            column_names=column_names,
+            include_columns=include_columns,
+            pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+            partial_file_download_params=partial_file_download_params,
+            **s3_client_kwargs,
+        )
+
     s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
     logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
     pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
@@ -272,6 +433,57 @@ def s3_file_to_table(
     return table


+def s3_file_to_parquet(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+) -> ParquetFile:
+    logger.debug(
+        f"Reading {s3_url} to PyArrow ParquetFile. "
+        f"Content type: {content_type}. Encoding: {content_encoding}"
+    )
+
+    if (
+        content_type != ContentType.PARQUET.value
+        or content_encoding != ContentEncoding.IDENTITY
+    ):
+        raise ValidationError(
+            f"S3 file with content type: {content_type} and "
+            f"content encoding: {content_encoding} cannot be read"
+            "into pyarrow.parquet.ParquetFile"
+        )
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+
+    if s3_url.startswith("s3://"):
+        s3_file_system = create_s3_file_system(s3_client_kwargs)
+        kwargs["filesystem"] = s3_file_system
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
+
+    kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
+
+    logger.debug(
+        f"Reading the file from {s3_url} into ParquetFile with kwargs: {kwargs}"
+    )
+    pqFile, latency = timed_invocation(ParquetFile, s3_url, **kwargs)
+
+    logger.debug(f"Time to get {s3_url} into parquet file: {latency}s")
+
+    return pqFile
+
+
 def table_size(table: pa.Table) -> int:
     return table.nbytes

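For context, the new Parquet read path in `s3_file_to_table` dispatches on the `reader_type` kwarg populated by `ReadKwargsProviderPyArrowSchemaOverride` (defaulting to the Daft reader) and otherwise falls back to a partial or full PyArrow read. Below is a minimal, hedged sketch of how a caller might force the plain PyArrow reader; the S3 URL is hypothetical and not part of the package:

```
from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pyarrow import (
    ReadKwargsProviderPyArrowSchemaOverride,
    s3_file_to_table,
)

# Hypothetical S3 object used purely for illustration.
table = s3_file_to_table(
    s3_url="s3://my-bucket/path/to/file.parquet",
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    # With parquet_reader_type unset, the new code path defaults to
    # reader_type="daft"; "pyarrow" selects the papq.read_table path instead.
    pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
        parquet_reader_type="pyarrow"
    ),
)
```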
deltacat/utils/ray_utils/concurrency.py
CHANGED
@@ -1,7 +1,7 @@
 import copy
 import itertools
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-
+from deltacat.utils.placement import PlacementGroupConfig
 import ray
 from ray._private.ray_constants import MIN_RESOURCE_GRANULARITY
 from ray.types import ObjectRef
@@ -115,3 +115,28 @@ def round_robin_options_provider(
     resource_key_index = i % len(resource_keys)
     key = resource_keys[resource_key_index]
     return {"resources": {key: resource_amount_provider(resource_key_index)}}
+
+
+def task_resource_options_provider(
+    i: int,
+    item: Any,
+    resource_amount_provider: Callable[[int, Any], Dict] = lambda x: {},
+    pg_config: Optional[PlacementGroupConfig] = None,
+    **kwargs,
+) -> Dict:
+    """
+    Return options that needs to be provided to each task.
+    """
+
+    options = resource_amount_provider(i, item, **kwargs)
+    if pg_config:
+        options_to_append = copy.deepcopy(pg_config.opts)
+        bundle_key_index = i % len(
+            options_to_append["scheduling_strategy"].placement_group.bundle_specs
+        )
+        options_to_append[
+            "scheduling_strategy"
+        ].placement_group_bundle_index = bundle_key_index
+        options = {**options, **options_to_append}
+
+    return options
deltacat/utils/resources.py
CHANGED
@@ -1,15 +1,20 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

+from contextlib import AbstractContextManager
+from types import TracebackType
 import ray
 import sys
-
+import threading
+import time
+from typing import Dict, Any, Optional
 from dataclasses import dataclass
 from deltacat import logs
 import logging
 from resource import getrusage, RUSAGE_SELF
 import platform
 import psutil
+import schedule


 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -56,6 +61,72 @@ class ClusterUtilization:
         )


+class ClusterUtilizationOverTimeRange(AbstractContextManager):
+    """
+    This class can be used to compute the cluster utilization metrics
+    which requires us to compute it over time as they change on-demand.
+
+    For example, in an autoscaling cluster, the vCPUs keep changing and hence
+    more important metrics to capture in that scenario is vcpu-seconds.
+    """
+
+    def __init__(self) -> None:
+        self.total_vcpu_seconds = 0.0
+        self.used_vcpu_seconds = 0.0
+
+    def __enter__(self) -> Any:
+        schedule.every().second.do(self._update_vcpus)
+        self.stop_run_schedules = self._run_schedule()
+        return super().__enter__()
+
+    def __exit__(
+        self,
+        __exc_type: type[BaseException] | None,
+        __exc_value: BaseException | None,
+        __traceback: TracebackType | None,
+    ) -> bool | None:
+        if __exc_value:
+            logger.error(
+                f"Error ocurred while calculating cluster resources: {__exc_value}"
+            )
+        self.stop_run_schedules.set()
+        return super().__exit__(__exc_type, __exc_value, __traceback)
+
+    # It is not truely parallel(due to GIL Ref: https://wiki.python.org/moin/GlobalInterpreterLock)
+    # even if we are using threading library. However, it averages out and gives a very good approximation.
+    def _update_vcpus(self):
+        cluster_resources = ray.cluster_resources()
+        available_resources = ray.available_resources()
+        if "CPU" not in cluster_resources:
+            return
+
+        if "CPU" in available_resources:
+            self.used_vcpu_seconds = self.used_vcpu_seconds + float(
+                str(cluster_resources["CPU"] - available_resources["CPU"])
+            )
+            self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+                str(cluster_resources["CPU"])
+            )
+        else:
+            self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+                str(cluster_resources["CPU"])
+            )
+
+    def _run_schedule(self, interval: Optional[float] = 1.0):
+        cease_continuous_run = threading.Event()
+
+        class ScheduleThread(threading.Thread):
+            @classmethod
+            def run(cls):
+                while not cease_continuous_run.is_set():
+                    schedule.run_pending()
+                    time.sleep(float(str(interval)))
+
+        continuous_thread = ScheduleThread()
+        continuous_thread.start()
+        return cease_continuous_run
+
+
 def get_current_node_peak_memory_usage_in_bytes():
     """
     Returns the peak memory usage of the node in bytes. This method works across
deltacat/utils/s3fs.py
ADDED
@@ -0,0 +1,21 @@
+import s3fs
+
+
+def create_s3_file_system(s3_client_kwargs: dict) -> s3fs.S3FileSystem:
+    if not s3_client_kwargs:
+        return s3fs.S3FileSystem(anon=True)
+
+    config_kwargs = {}
+    if s3_client_kwargs.get("config") is not None:
+        boto_config = s3_client_kwargs.pop("config")
+        for key, val in boto_config.__dict__.items():
+            if not key.startswith("_") and val is not None:
+                config_kwargs[key] = val
+
+    anon = False
+    if s3_client_kwargs.get("aws_access_key_id") is None:
+        anon = True
+
+    return s3fs.S3FileSystem(
+        anon=anon, client_kwargs=s3_client_kwargs, config_kwargs=config_kwargs or None
+    )
{deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b13
+Version: 0.1.18b15
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -15,17 +15,19 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
-Requires-Dist: boto3
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: pyarrow
-Requires-Dist: pydantic
-Requires-Dist: ray[default]
-Requires-Dist: s3fs
-Requires-Dist: tenacity
-Requires-Dist: typing-extensions
-Requires-Dist: pymemcache
-Requires-Dist: redis
+Requires-Dist: boto3 ~=1.20
+Requires-Dist: numpy ==1.21.5
+Requires-Dist: pandas ==1.3.5
+Requires-Dist: pyarrow ==12.0.1
+Requires-Dist: pydantic ==1.10.4
+Requires-Dist: ray[default] ~=2.0
+Requires-Dist: s3fs ==2022.2.0
+Requires-Dist: tenacity ==8.1.0
+Requires-Dist: typing-extensions ==4.4.0
+Requires-Dist: pymemcache ==4.0.0
+Requires-Dist: redis ==4.6.0
+Requires-Dist: getdaft ==0.1.15
+Requires-Dist: schedule ==1.2.0

 # DeltaCAT

@@ -40,10 +42,22 @@ for common table management tasks, including petabyte-scale
 change-data-capture, data consistency checks, and table repair.

 ## Getting Started
-
+
 ### Install
+
 ```
 pip install deltacat
 ```

+### Running Tests
+
+```
+pip3 install virtualenv
+virtualenv test_env
+source test_env/bin/activate
+pip3 install -r requirements.txt
+
+pytest
+```
+
