deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (95)
  1. deltacat/__init__.py +3 -2
  2. deltacat/aws/clients.py +123 -3
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  6. deltacat/benchmarking/conftest.py +61 -0
  7. deltacat/catalog/delegate.py +1 -1
  8. deltacat/catalog/interface.py +1 -1
  9. deltacat/compute/compactor/__init__.py +0 -3
  10. deltacat/compute/compactor/compaction_session.py +45 -20
  11. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  12. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  13. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  14. deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  15. deltacat/compute/compactor/model/primary_key_index.py +1 -1
  16. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  17. deltacat/compute/compactor/repartition_session.py +5 -3
  18. deltacat/compute/compactor/steps/dedupe.py +10 -8
  19. deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  20. deltacat/compute/compactor/steps/materialize.py +11 -6
  21. deltacat/compute/compactor/steps/repartition.py +16 -1
  22. deltacat/compute/compactor/utils/io.py +40 -23
  23. deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  24. deltacat/compute/compactor/utils/sort_key.py +57 -0
  25. deltacat/compute/compactor/utils/system_columns.py +43 -0
  26. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  27. deltacat/compute/compactor_v2/constants.py +34 -0
  28. deltacat/compute/compactor_v2/model/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  30. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  31. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  32. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  33. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  34. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  37. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  38. deltacat/compute/compactor_v2/utils/io.py +149 -0
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  40. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  41. deltacat/compute/metastats/meta_stats.py +4 -2
  42. deltacat/compute/metastats/stats.py +1 -0
  43. deltacat/compute/metastats/utils/io.py +4 -0
  44. deltacat/compute/stats/utils/io.py +20 -5
  45. deltacat/exceptions.py +4 -0
  46. deltacat/io/memcached_object_store.py +37 -14
  47. deltacat/logs.py +4 -3
  48. deltacat/storage/__init__.py +3 -0
  49. deltacat/storage/interface.py +11 -2
  50. deltacat/storage/model/sort_key.py +33 -0
  51. deltacat/storage/model/table_version.py +11 -0
  52. deltacat/storage/model/types.py +2 -1
  53. deltacat/tests/aws/__init__.py +0 -0
  54. deltacat/tests/aws/test_clients.py +80 -0
  55. deltacat/tests/compute/__init__.py +0 -0
  56. deltacat/tests/compute/common.py +96 -0
  57. deltacat/tests/compute/compactor/__init__.py +0 -0
  58. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  59. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  60. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  61. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  62. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  63. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  64. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  65. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  66. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  67. deltacat/tests/compute/testcases.py +390 -0
  68. deltacat/tests/io/test_memcached_object_store.py +5 -4
  69. deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  70. deltacat/tests/test_utils/pyarrow.py +32 -0
  71. deltacat/tests/test_utils/utils.py +13 -0
  72. deltacat/tests/utils/data/__init__.py +0 -0
  73. deltacat/tests/utils/test_daft.py +76 -0
  74. deltacat/tests/utils/test_pyarrow.py +133 -0
  75. deltacat/tests/utils/test_resources.py +23 -20
  76. deltacat/types/media.py +1 -0
  77. deltacat/types/partial_download.py +82 -0
  78. deltacat/types/tables.py +1 -0
  79. deltacat/utils/arguments.py +26 -0
  80. deltacat/utils/daft.py +87 -0
  81. deltacat/utils/performance.py +4 -2
  82. deltacat/utils/placement.py +20 -3
  83. deltacat/utils/pyarrow.py +213 -1
  84. deltacat/utils/ray_utils/concurrency.py +26 -1
  85. deltacat/utils/resources.py +72 -1
  86. deltacat/utils/s3fs.py +21 -0
  87. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
  88. deltacat-0.1.18b15.dist-info/RECORD +176 -0
  89. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  90. deltacat/compute/compactor/model/sort_key.py +0 -98
  91. deltacat-0.1.18b13.dist-info/RECORD +0 -136
  92. /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
  93. /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
  94. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  95. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/utils/pyarrow.py CHANGED
@@ -7,6 +7,8 @@ import io
 import logging
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional
+from pyarrow.parquet import ParquetFile
+from deltacat.exceptions import ValidationError
 
 import pyarrow as pa
 from fsspec import AbstractFileSystem
@@ -15,6 +17,7 @@ from pyarrow import feather as paf
 from pyarrow import json as pajson
 from pyarrow import parquet as papq
 from ray.data.datasource import BlockWritePathProvider
+from deltacat.utils.s3fs import create_s3_file_system
 
 from deltacat import logs
 from deltacat.types.media import (
@@ -23,12 +26,17 @@ from deltacat.types.media import (
     ContentEncoding,
     ContentType,
 )
+from deltacat.types.partial_download import (
+    PartialFileDownloadParams,
+    PartialParquetParameters,
+)
 from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
+from deltacat.utils.daft import daft_s3_file_to_table
+from deltacat.utils.arguments import sanitize_kwargs_to_callable
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pacsv.read_csv,
     ContentType.TSV.value: pacsv.read_csv,
@@ -170,6 +178,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         self,
         schema: Optional[pa.Schema] = None,
         pq_coerce_int96_timestamp_unit: Optional[str] = None,
+        parquet_reader_type: Optional[str] = None,
     ):
         """
 
@@ -182,6 +191,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         """
         self.schema = schema
         self.pq_coerce_int96_timestamp_unit = pq_coerce_int96_timestamp_unit
+        self.parquet_reader_type = parquet_reader_type
 
     def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
@@ -201,6 +211,11 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
                     "coerce_int96_timestamp_unit"
                 ] = self.pq_coerce_int96_timestamp_unit
 
+            if self.parquet_reader_type:
+                kwargs["reader_type"] = self.parquet_reader_type
+            else:
+                kwargs["reader_type"] = "daft"
+
         return kwargs
 
 
@@ -237,6 +252,118 @@ def _add_column_kwargs(
             )
 
 
+def _get_compatible_target_schema(
+    table_schema: pa.Schema, input_schema: pa.Schema
+) -> pa.Schema:
+    target_schema_fields = []
+
+    for field in table_schema:
+        index = input_schema.get_field_index(field.name)
+
+        if index != -1:
+            target_field = input_schema.field(index)
+            target_schema_fields.append(target_field)
+        else:
+            target_schema_fields.append(field)
+
+    target_schema = pa.schema(target_schema_fields, metadata=table_schema.metadata)
+
+    return target_schema
+
+
+def s3_partial_parquet_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialParquetParameters] = None,
+    **s3_client_kwargs,
+) -> pa.Table:
+
+    assert (
+        partial_file_download_params is not None
+    ), "Partial parquet params must not be None"
+    assert (
+        partial_file_download_params.row_groups_to_download is not None
+    ), "No row groups to download"
+
+    pq_file = s3_file_to_parquet(
+        s3_url=s3_url,
+        content_type=content_type,
+        content_encoding=content_encoding,
+        partial_file_download_params=partial_file_download_params,
+        **s3_client_kwargs,
+    )
+
+    table, latency = timed_invocation(
+        pq_file.read_row_groups,
+        partial_file_download_params.row_groups_to_download,
+        columns=include_columns or column_names,
+    )
+
+    logger.debug(f"Successfully read from s3_url={s3_url} in {latency}s")
+
+    kwargs = {}
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    # Note: ordering is not guaranteed.
+    if kwargs.get("schema") is not None:
+        input_schema = kwargs.get("schema")
+        table_schema = table.schema
+
+        target_schema = _get_compatible_target_schema(table_schema, input_schema)
+        casted_table = table.cast(target_schema)
+
+        return casted_table
+
+    return table
+
+
+def s3_parquet_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+) -> pa.Table:
+
+    logger.debug(
+        f"Reading to Parquet table using read_table for {content_type} "
+        f"and encoding: {content_encoding}"
+    )
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+
+    if s3_url.startswith("s3://"):
+        s3_file_system = create_s3_file_system(s3_client_kwargs)
+        kwargs["filesystem"] = s3_file_system
+
+    _add_column_kwargs(
+        content_type=content_type,
+        column_names=column_names,
+        include_columns=include_columns,
+        kwargs=kwargs,
+    )
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    table, latency = timed_invocation(papq.read_table, s3_url, **kwargs)
+
+    logger.debug(f"Successfully read the table from url={s3_url} in {latency}s")
+    return table
+
+
 def s3_file_to_table(
     s3_url: str,
     content_type: str,
@@ -244,6 +371,7 @@ def s3_file_to_table(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> pa.Table:
 
@@ -253,6 +381,39 @@ def s3_file_to_table(
         f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
         f"Encoding: {content_encoding}"
     )
+
+    if (
+        content_type == ContentType.PARQUET.value
+        and content_encoding == ContentEncoding.IDENTITY.value
+    ):
+        logger.debug(
+            f"Performing read using parquet reader for encoding={content_encoding} "
+            f"and content_type={content_type}"
+        )
+        kwargs = {}
+        if pa_read_func_kwargs_provider is not None:
+            kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+        if kwargs.get("reader_type", "daft") == "daft":
+            parquet_reader_func = daft_s3_file_to_table
+        elif partial_file_download_params and isinstance(
+            partial_file_download_params, PartialParquetParameters
+        ):
+            parquet_reader_func = s3_partial_parquet_file_to_table
+        else:
+            parquet_reader_func = s3_parquet_file_to_table
+
+        return parquet_reader_func(
+            s3_url=s3_url,
+            content_type=content_type,
+            content_encoding=content_encoding,
+            column_names=column_names,
+            include_columns=include_columns,
+            pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+            partial_file_download_params=partial_file_download_params,
+            **s3_client_kwargs,
+        )
+
     s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
     logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
     pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
@@ -272,6 +433,57 @@ def s3_file_to_table(
     return table
 
 
+def s3_file_to_parquet(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+) -> ParquetFile:
+    logger.debug(
+        f"Reading {s3_url} to PyArrow ParquetFile. "
+        f"Content type: {content_type}. Encoding: {content_encoding}"
+    )
+
+    if (
+        content_type != ContentType.PARQUET.value
+        or content_encoding != ContentEncoding.IDENTITY
+    ):
+        raise ValidationError(
+            f"S3 file with content type: {content_type} and "
+            f"content encoding: {content_encoding} cannot be read"
+            "into pyarrow.parquet.ParquetFile"
+        )
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+
+    if s3_url.startswith("s3://"):
+        s3_file_system = create_s3_file_system(s3_client_kwargs)
+        kwargs["filesystem"] = s3_file_system
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
+
+    kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
+
+    logger.debug(
+        f"Reading the file from {s3_url} into ParquetFile with kwargs: {kwargs}"
+    )
+    pqFile, latency = timed_invocation(ParquetFile, s3_url, **kwargs)
+
+    logger.debug(f"Time to get {s3_url} into parquet file: {latency}s")
+
+    return pqFile
+
+
 def table_size(table: pa.Table) -> int:
     return table.nbytes
 
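The hunks above add a reader-dispatch path to `s3_file_to_table`: identity-encoded Parquet now routes to `daft_s3_file_to_table` by default, or to the PyArrow-based `s3_partial_parquet_file_to_table` / `s3_parquet_file_to_table` readers when a different `reader_type` or partial-download parameters are supplied. Below is a minimal usage sketch (not part of the diff); the S3 URL and column names are placeholders.

```python
from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pyarrow import (
    ReadKwargsProviderPyArrowSchemaOverride,
    s3_file_to_table,
)

# parquet_reader_type="pyarrow" makes the kwargs provider emit
# kwargs["reader_type"] = "pyarrow", so s3_file_to_table skips the default
# daft reader and uses s3_parquet_file_to_table (or the partial-download
# variant when PartialParquetParameters are supplied).
table = s3_file_to_table(
    s3_url="s3://example-bucket/example.parquet",  # placeholder URL
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    include_columns=["pk", "value"],  # placeholder column names
    pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
        parquet_reader_type="pyarrow"
    ),
)
print(table.schema)
```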
deltacat/utils/ray_utils/concurrency.py CHANGED
@@ -1,7 +1,7 @@
 import copy
 import itertools
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-
+from deltacat.utils.placement import PlacementGroupConfig
 import ray
 from ray._private.ray_constants import MIN_RESOURCE_GRANULARITY
 from ray.types import ObjectRef
@@ -115,3 +115,28 @@ def round_robin_options_provider(
     resource_key_index = i % len(resource_keys)
     key = resource_keys[resource_key_index]
     return {"resources": {key: resource_amount_provider(resource_key_index)}}
+
+
+def task_resource_options_provider(
+    i: int,
+    item: Any,
+    resource_amount_provider: Callable[[int, Any], Dict] = lambda x: {},
+    pg_config: Optional[PlacementGroupConfig] = None,
+    **kwargs,
+) -> Dict:
+    """
+    Return options that needs to be provided to each task.
+    """
+
+    options = resource_amount_provider(i, item, **kwargs)
+    if pg_config:
+        options_to_append = copy.deepcopy(pg_config.opts)
+        bundle_key_index = i % len(
+            options_to_append["scheduling_strategy"].placement_group.bundle_specs
+        )
+        options_to_append[
+            "scheduling_strategy"
+        ].placement_group_bundle_index = bundle_key_index
+        options = {**options, **options_to_append}
+
+    return options
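The new `task_resource_options_provider` merges a per-task resource estimate with placement-group scheduling options. A hedged sketch of how it might be wired up follows (not from the diff); `estimate_task_memory` and the item list are hypothetical, and `pg_config` is left as `None` here.

```python
import functools

import ray

from deltacat.utils.ray_utils.concurrency import task_resource_options_provider


def estimate_task_memory(i: int, item, **kwargs) -> dict:
    # Hypothetical sizing rule: reserve 2 GiB of memory per task.
    return {"memory": 2 * 1024 * 1024 * 1024}


@ray.remote
def process(item):
    return item


# When a PlacementGroupConfig is given, the provider also deep-copies
# pg_config.opts and round-robins placement_group_bundle_index across bundles.
options_provider = functools.partial(
    task_resource_options_provider,
    resource_amount_provider=estimate_task_memory,
    pg_config=None,
)

ray.init(ignore_reinit_error=True)
refs = [
    process.options(**options_provider(i, item)).remote(item)
    for i, item in enumerate(["a", "b", "c"])
]
print(ray.get(refs))
```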
deltacat/utils/resources.py CHANGED
@@ -1,15 +1,20 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from contextlib import AbstractContextManager
+from types import TracebackType
 import ray
 import sys
-from typing import Dict, Any
+import threading
+import time
+from typing import Dict, Any, Optional
 from dataclasses import dataclass
 from deltacat import logs
 import logging
 from resource import getrusage, RUSAGE_SELF
 import platform
 import psutil
+import schedule
 
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -56,6 +61,72 @@ class ClusterUtilization:
         )
 
 
+class ClusterUtilizationOverTimeRange(AbstractContextManager):
+    """
+    This class can be used to compute the cluster utilization metrics
+    which requires us to compute it over time as they change on-demand.
+
+    For example, in an autoscaling cluster, the vCPUs keep changing and hence
+    more important metrics to capture in that scenario is vcpu-seconds.
+    """
+
+    def __init__(self) -> None:
+        self.total_vcpu_seconds = 0.0
+        self.used_vcpu_seconds = 0.0
+
+    def __enter__(self) -> Any:
+        schedule.every().second.do(self._update_vcpus)
+        self.stop_run_schedules = self._run_schedule()
+        return super().__enter__()
+
+    def __exit__(
+        self,
+        __exc_type: type[BaseException] | None,
+        __exc_value: BaseException | None,
+        __traceback: TracebackType | None,
+    ) -> bool | None:
+        if __exc_value:
+            logger.error(
+                f"Error ocurred while calculating cluster resources: {__exc_value}"
+            )
+        self.stop_run_schedules.set()
+        return super().__exit__(__exc_type, __exc_value, __traceback)
+
+    # It is not truely parallel(due to GIL Ref: https://wiki.python.org/moin/GlobalInterpreterLock)
+    # even if we are using threading library. However, it averages out and gives a very good approximation.
+    def _update_vcpus(self):
+        cluster_resources = ray.cluster_resources()
+        available_resources = ray.available_resources()
+        if "CPU" not in cluster_resources:
+            return
+
+        if "CPU" in available_resources:
+            self.used_vcpu_seconds = self.used_vcpu_seconds + float(
+                str(cluster_resources["CPU"] - available_resources["CPU"])
+            )
+            self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+                str(cluster_resources["CPU"])
+            )
+        else:
+            self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+                str(cluster_resources["CPU"])
+            )
+
+    def _run_schedule(self, interval: Optional[float] = 1.0):
+        cease_continuous_run = threading.Event()
+
+        class ScheduleThread(threading.Thread):
+            @classmethod
+            def run(cls):
+                while not cease_continuous_run.is_set():
+                    schedule.run_pending()
+                    time.sleep(float(str(interval)))
+
+        continuous_thread = ScheduleThread()
+        continuous_thread.start()
+        return cease_continuous_run
+
+
 def get_current_node_peak_memory_usage_in_bytes():
     """
     Returns the peak memory usage of the node in bytes. This method works across
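The added `ClusterUtilizationOverTimeRange` samples Ray cluster CPU counts once per second on a background schedule thread and accumulates them into vCPU-second totals. A minimal usage sketch (not from the diff), assuming a running Ray cluster and a placeholder workload:

```python
import time

import ray

from deltacat.utils.resources import ClusterUtilizationOverTimeRange

ray.init(ignore_reinit_error=True)

with ClusterUtilizationOverTimeRange() as cluster_util:
    time.sleep(5)  # stand-in for a compaction round or other cluster workload

# Totals accumulate roughly once per second while the context manager is active.
print(f"total vCPU-seconds: {cluster_util.total_vcpu_seconds}")
print(f"used vCPU-seconds: {cluster_util.used_vcpu_seconds}")
```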
deltacat/utils/s3fs.py ADDED
@@ -0,0 +1,21 @@
+import s3fs
+
+
+def create_s3_file_system(s3_client_kwargs: dict) -> s3fs.S3FileSystem:
+    if not s3_client_kwargs:
+        return s3fs.S3FileSystem(anon=True)
+
+    config_kwargs = {}
+    if s3_client_kwargs.get("config") is not None:
+        boto_config = s3_client_kwargs.pop("config")
+        for key, val in boto_config.__dict__.items():
+            if not key.startswith("_") and val is not None:
+                config_kwargs[key] = val
+
+    anon = False
+    if s3_client_kwargs.get("aws_access_key_id") is None:
+        anon = True
+
+    return s3fs.S3FileSystem(
+        anon=anon, client_kwargs=s3_client_kwargs, config_kwargs=config_kwargs or None
+    )
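The new `create_s3_file_system` converts boto-style client kwargs (including an optional botocore `Config`) into an `s3fs.S3FileSystem`, falling back to anonymous access when no credentials are present. A hedged usage sketch with placeholder credentials:

```python
from botocore.config import Config

from deltacat.utils.s3fs import create_s3_file_system

s3_client_kwargs = {
    "aws_access_key_id": "AKIA-EXAMPLE",      # placeholder credentials
    "aws_secret_access_key": "EXAMPLE-KEY",   # placeholder credentials
    "region_name": "us-east-1",
    # Public, non-None attributes of the Config are copied into config_kwargs.
    "config": Config(retries={"max_attempts": 5}),
}

fs = create_s3_file_system(s3_client_kwargs)

# With empty kwargs (or no aws_access_key_id) the filesystem is anonymous.
anon_fs = create_s3_file_system({})
print(type(fs), type(anon_fs))
```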
{deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b13
+Version: 0.1.18b15
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -15,17 +15,19 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
-Requires-Dist: boto3 (~=1.20)
-Requires-Dist: numpy (==1.21.5)
-Requires-Dist: pandas (==1.3.5)
-Requires-Dist: pyarrow (==10.0.1)
-Requires-Dist: pydantic (==1.10.4)
-Requires-Dist: ray[default] (~=2.0)
-Requires-Dist: s3fs (==2022.2.0)
-Requires-Dist: tenacity (==8.1.0)
-Requires-Dist: typing-extensions (==4.4.0)
-Requires-Dist: pymemcache (==4.0.0)
-Requires-Dist: redis (==4.6.0)
+Requires-Dist: boto3 ~=1.20
+Requires-Dist: numpy ==1.21.5
+Requires-Dist: pandas ==1.3.5
+Requires-Dist: pyarrow ==12.0.1
+Requires-Dist: pydantic ==1.10.4
+Requires-Dist: ray[default] ~=2.0
+Requires-Dist: s3fs ==2022.2.0
+Requires-Dist: tenacity ==8.1.0
+Requires-Dist: typing-extensions ==4.4.0
+Requires-Dist: pymemcache ==4.0.0
+Requires-Dist: redis ==4.6.0
+Requires-Dist: getdaft ==0.1.15
+Requires-Dist: schedule ==1.2.0
 
 # DeltaCAT
 
@@ -40,10 +42,22 @@ for common table management tasks, including petabyte-scale
 change-data-capture, data consistency checks, and table repair.
 
 ## Getting Started
----
+
 ### Install
+
 ```
 pip install deltacat
 ```
 
+### Running Tests
+
+```
+pip3 install virtualenv
+virtualenv test_env
+source test_env/bin/activate
+pip3 install -r requirements.txt
+
+pytest
+```
+