deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/utils/ray_utils/dataset.py
@@ -1,26 +1,25 @@
  import logging
+ from typing import Callable, Dict, List, Optional
 
- from pyarrow import csv as pacsv
  from fsspec import AbstractFileSystem
-
+ from pyarrow import csv as pacsv
  from ray.data import Dataset
  from ray.data.datasource import BlockWritePathProvider
 
  from deltacat import logs
- from deltacat.types.media import ContentType, ContentEncoding
-
- from typing import Callable, Dict, List, Optional
+ from deltacat.types.media import ContentEncoding, ContentType
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
  def write_parquet(
- dataset: Dataset,
- base_path: str,
- *,
- filesystem: AbstractFileSystem,
- block_path_provider: BlockWritePathProvider,
- **kwargs) -> None:
+ dataset: Dataset,
+ base_path: str,
+ *,
+ filesystem: AbstractFileSystem,
+ block_path_provider: BlockWritePathProvider,
+ **kwargs,
+ ) -> None:
 
  dataset.write_parquet(
  base_path,
@@ -32,12 +31,13 @@ def write_parquet(
 
 
  def write_csv(
- dataset: Dataset,
- base_path: str,
- *,
- filesystem: AbstractFileSystem,
- block_path_provider: BlockWritePathProvider,
- **kwargs) -> None:
+ dataset: Dataset,
+ base_path: str,
+ *,
+ filesystem: AbstractFileSystem,
+ block_path_provider: BlockWritePathProvider,
+ **kwargs,
+ ) -> None:
 
  # column names are kept in table metadata, so omit header
  arrow_csv_args_fn = lambda: {
@@ -61,9 +61,7 @@ CONTENT_TYPE_TO_DATASET_WRITE_FUNC: Dict[str, Callable] = {
  }
 
 
- def slice_dataset(
- dataset: Dataset,
- max_len: Optional[int]) -> List[Dataset]:
+ def slice_dataset(dataset: Dataset, max_len: Optional[int]) -> List[Dataset]:
  """
  Returns equally-sized dataset slices of up to `max_len` records each.
  """
@@ -88,12 +86,13 @@ def dataset_size(dataset: Dataset) -> int:
 
 
  def dataset_to_file(
- table: Dataset,
- base_path: str,
- file_system: AbstractFileSystem,
- block_path_provider: BlockWritePathProvider,
- content_type: str = ContentType.PARQUET.value,
- **kwargs) -> None:
+ table: Dataset,
+ base_path: str,
+ file_system: AbstractFileSystem,
+ block_path_provider: BlockWritePathProvider,
+ content_type: str = ContentType.PARQUET.value,
+ **kwargs,
+ ) -> None:
  """
  Writes the given Distributed Dataset to one or more files.
  """
@@ -102,11 +101,12 @@ def dataset_to_file(
  raise NotImplementedError(
  f"Distributed Dataset writer for content type '{content_type}' not"
  f" implemented. Known content types: "
- f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}")
+ f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}"
+ )
  writer(
  table,
  base_path,
  filesystem=file_system,
  block_path_provider=block_path_provider,
- **kwargs
+ **kwargs,
  )
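For context, a minimal sketch (not part of the package) of how the reformatted `slice_dataset` helper above can be exercised, assuming a running Ray session:

```
import ray

from deltacat.utils.ray_utils.dataset import slice_dataset

ray.init()

# Build a small demo dataset and split it into slices of at most 300 records,
# per the slice_dataset docstring shown in the diff above.
ds = ray.data.range(1_000)
slices = slice_dataset(ds, 300)
print([s.count() for s in slices])
```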
deltacat/utils/ray_utils/performance.py
@@ -1,20 +1,16 @@
  import time
- from deltacat.utils.ray_utils.collections import DistributedCounter
  from typing import Any, Callable, Tuple
 
+ from deltacat.utils.ray_utils.collections import DistributedCounter
+
 
  def invoke_with_perf_counter(
- counter: DistributedCounter,
- counter_key: Any,
- func: Callable,
- *args,
- **kwargs) -> Tuple[Any, float]:
+ counter: DistributedCounter, counter_key: Any, func: Callable, *args, **kwargs
+ ) -> Tuple[Any, float]:
 
  start = time.perf_counter()
  result = func(*args, **kwargs)
  stop = time.perf_counter()
  latency = stop - start
- counter.increment.remote(
- counter_key,
- latency)
+ counter.increment.remote(counter_key, latency)
  return result, latency
deltacat/utils/ray_utils/runtime.py
@@ -1,17 +1,17 @@
- import ray
  import logging
  import time
+ from typing import Any, Callable, Dict, List
 
- from deltacat import logs
+ import ray
 
- from typing import Any, Callable, Dict, List
+ from deltacat import logs
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
  def node_resource_keys(
- filter_fn: Callable[[Dict[str, Any]], bool] = lambda n: True) \
- -> List[str]:
+ filter_fn: Callable[[Dict[str, Any]], bool] = lambda n: True
+ ) -> List[str]:
  """Get all Ray resource keys for cluster nodes that pass the given filter
  as a list of strings of the form: "node:{node_resource_name}". The returned
  keys can be used to place tasks or actors on that node via:
@@ -39,8 +39,9 @@ def current_node_resource_key() -> str:
  """
  current_node_id = ray.get_runtime_context().node_id.hex()
  keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
- assert len(keys) <= 1, \
- f"Expected <= 1 keys for the current node, but found {len(keys)}"
+ assert (
+ len(keys) <= 1
+ ), f"Expected <= 1 keys for the current node, but found {len(keys)}"
  return keys[0] if len(keys) == 1 else None
 
 
@@ -55,9 +56,7 @@ def live_node_count() -> int:
  return sum(1 for n in ray.nodes() if is_node_alive(n))
 
 
- def live_node_waiter(
- min_live_nodes: int,
- poll_interval_seconds: float = 0.5) -> None:
+ def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) -> None:
  """Waits until the given minimum number of live nodes are present in the
  cluster. Checks the current number of live nodes every
  `poll_interval_seconds`."""
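The `node_resource_keys` docstring above notes that the returned keys can be used to place tasks or actors on a specific node. A minimal sketch of that pattern, assuming a running Ray cluster (the `ping` task is hypothetical, not from the package):

```
import ray

from deltacat.utils.ray_utils.runtime import current_node_resource_key

ray.init()

@ray.remote
def ping() -> str:
    return "pong"

# Request a small fraction of the current node's custom "node:..." resource
# so the scheduler pins the task to this node.
key = current_node_resource_key()
print(ray.get(ping.options(resources={key: 0.01}).remote()))
```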
{deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA
@@ -1,9 +1,11 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 0.1.8
+ Version: 0.1.11
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
+ License: UNKNOWN
+ Platform: UNKNOWN
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -13,25 +15,33 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Operating System :: OS Independent
  Requires-Python: >=3.7
  Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: s3fs (==2022.1.0)
- Requires-Dist: tenacity (==8.0.1)
- Requires-Dist: ray[default] (==2.0.0)
- Requires-Dist: pandas (>=1.3.1)
- Requires-Dist: pyarrow (==8.0.0)
- Requires-Dist: pydantic (==1.10.2)
- Requires-Dist: numpy (>=1.21.1)
  Requires-Dist: boto3 (==1.20.24)
+ Requires-Dist: numpy (==1.21.5)
+ Requires-Dist: pandas (==1.3.5)
+ Requires-Dist: pyarrow (==10.0.1)
+ Requires-Dist: pydantic (==1.10.4)
+ Requires-Dist: ray[default] (==2.0.0)
+ Requires-Dist: s3fs (==2022.2.0)
+ Requires-Dist: tenacity (==8.1.0)
  Requires-Dist: typing-extensions (==4.4.0)
 
  # DeltaCAT
 
  DeltaCAT is a Pythonic Data Catalog powered by Ray.
 
- Its data storage model allows you to define and manage fast, scalable,
- ACID-compliant data catalogs through git-like stage/commit APIs, and has been
+ Its data storage model allows you to define and manage fast, scalable,
+ ACID-compliant data catalogs through git-like stage/commit APIs, and has been
  used to successfully host exabyte-scale enterprise data lakes.
 
  DeltaCAT uses the Ray distributed compute framework together with Apache Arrow
- for common table management tasks, including petabyte-scale
+ for common table management tasks, including petabyte-scale
  change-data-capture, data consistency checks, and table repair.
+
+ ## Getting Started
+ ---
+ ### Install
+ ```
+ pip install deltacat
+ ```
+
+
deltacat-0.1.11.dist-info/RECORD
@@ -0,0 +1,110 @@
+ deltacat/__init__.py,sha256=gF2hBR7_JIL4gQRa1JN-fDraLLLmwXisIOcZLpbwTCM,1808
+ deltacat/constants.py,sha256=E_1hOQolyvJCWB8eIVWtlAMgk2dmXGyXBhW05czilwQ,1173
+ deltacat/exceptions.py,sha256=x7qem7FLujXf-DzPsNcQ-XYkW3cF3A0YGIbxkcpz0Mw,146
+ deltacat/logs.py,sha256=T_-_JwOZFRSV64-KvmhMf-dInvYvuO4CSN-1EDrJJsU,5808
+ deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/aws/clients.py,sha256=gBwSjCUfzQyiq-empApD685S-FhVlTjdJtDIWpR_alg,1743
+ deltacat/aws/constants.py,sha256=4OnwC1H12FPs2bpVN7tXkxn-DAEJS4MYRrFh5HKFv7s,204
+ deltacat/aws/s3u.py,sha256=KTAG9uNCYpANG-rWNAByhF062bpEnCqFP-_Crp1y6dA,17371
+ deltacat/aws/redshift/__init__.py,sha256=fjuv3jWdPE8IgF4uSrL0YEqV3XUfqDULX3xV27ICceo,266
+ deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/aws/redshift/model/manifest.py,sha256=E71avhRHQGEZ6It3-axCB5FdiieQhSMu9Wt8oZAdXro,9519
+ deltacat/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/catalog/delegate.py,sha256=yuMh8vcXwYBcaMO9HYasbj4DHJIl6Y5xZ5Qd2kTT278,8755
+ deltacat/catalog/interface.py,sha256=A3Mr5tOBEG4VgDJuzrt5XEwrbNxYZPWHq34TAJvnX5M,6566
+ deltacat/catalog/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/catalog/model/catalog.py,sha256=-Ho7a3rV1hiOS9cSRCAor9AtXV9nJn9t_MDVql9pIxo,2212
+ deltacat/catalog/model/table_definition.py,sha256=tKrM1mmaQlvxqXrLt3QJVZK5BZfaJnhjTZ6KjybYlhE,727
+ deltacat/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/compactor/__init__.py,sha256=5wjMMS0-22weCtKZe76dQwT6YGFCYgLtvhsb2gd5a8M,1078
+ deltacat/compute/compactor/compaction_session.py,sha256=JJHZZmsQ4bOXYnmazomlnd64kcDCEdsfN7TMXRIqYvs,21182
+ deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/compactor/model/delta_annotated.py,sha256=BFnyeoeLkSwCDsfzipENgRBU8x8GiudSqZj946H8agY,7703
+ deltacat/compute/compactor/model/delta_file_envelope.py,sha256=SfmdEPbfCfUekV_NvlWcyQOYo1CEHcRCGXHWHqmYQYg,1835
+ deltacat/compute/compactor/model/delta_file_locator.py,sha256=Cc-YzxxyrXK6FlY8ek2L92XzfT0qkMCxs6yrC_FsEwU,1766
+ deltacat/compute/compactor/model/materialize_result.py,sha256=b1Pwa89fgvr7rX3uSWwIt2ld-ElmqOSu-BXkZ1wwXdA,1253
+ deltacat/compute/compactor/model/primary_key_index.py,sha256=MT4zqwhzh3e9qZotWvZavT_MtWXm_81ojfcOCv1t17w,10459
+ deltacat/compute/compactor/model/pyarrow_write_result.py,sha256=WYIa0DRcyaemR6yUS8_8RLQ2voTmCVNFUL99qxPmt70,1324
+ deltacat/compute/compactor/model/round_completion_info.py,sha256=3s0rAjJoV_IZ9OBe6KxopOijte2cS4khS2Nuw-Q2NQ8,3041
+ deltacat/compute/compactor/model/sort_key.py,sha256=XDIoYrV18FciomV5yWxu1OaDsD78trmUUtseyRurIKo,4124
+ deltacat/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/compactor/steps/dedupe.py,sha256=cSmgeTYm0lQvteNZhvRFJjX0zeBJd2-gvnY9smsVmzk,11044
+ deltacat/compute/compactor/steps/hash_bucket.py,sha256=XIPudRw4a9l7cCjfIQhYz_szktaY4tD6pcHRNBIh-HM,5809
+ deltacat/compute/compactor/steps/materialize.py,sha256=8xsKBhBxankFLas6Ay97KeYcwyFlCs82UieBTLneTTE,8786
+ deltacat/compute/compactor/steps/rehash/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/compactor/steps/rehash/rehash_bucket.py,sha256=yh-sBuUI3hqw2vk_nK9o-KDrgSww4oSvAz2hBxTkv8s,1765
+ deltacat/compute/compactor/steps/rehash/rewrite_index.py,sha256=-HVM08pk5ROHEgDP-FVty55-a_0dsGRiSnPlNJw7C6Q,1838
+ deltacat/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/compactor/utils/io.py,sha256=QMyh-oh4EqpN-lnurxJAS50IvY5LNoWHT6o6KhEPDiw,9637
+ deltacat/compute/compactor/utils/primary_key_index.py,sha256=taYw1AjGIFlD9c8OXyj9ps816a15B61aoV4I00EAUyo,12072
+ deltacat/compute/compactor/utils/round_completion_file.py,sha256=F4HwEG3egg59w4eiSvwrYImcDWgk56oXOyprnuYZYdE,2078
+ deltacat/compute/compactor/utils/system_columns.py,sha256=ge4cL0RVsZ-9vTyU0xErnB-ClVASF5_CxsOTnAXnpfc,7106
+ deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/metastats/meta_stats.py,sha256=-Fb0yQAdUUgm2IShcWlPZto-qdivF-nK05sQqJu7K5s,18588
+ deltacat/compute/metastats/stats.py,sha256=-aFFrh7b--PzvQWNJG5_PgdN7ZM1bmGMeha5khwxhNw,7285
+ deltacat/compute/metastats/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/metastats/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/metastats/model/partition_stats_dict.py,sha256=FbfoOxmTZfjRT7iHwc_96gHmB_r6iUvVM9BoTldD5mY,1123
+ deltacat/compute/metastats/model/stats_cluster_size_estimator.py,sha256=AfH2rsC1DdJ2R_CwOPgjGJ04h-yWROsMfTw83GdpGXM,2849
+ deltacat/compute/metastats/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/metastats/utils/constants.py,sha256=bFUPKmR3FkjEnwpHuToQYZ9QcHqYpd4OMMSwVwnJcaA,869
+ deltacat/compute/metastats/utils/io.py,sha256=F9bY0Wo-qeokBLn5eXN9zIV2duLTXO5aNUMbL3_Ae2U,8825
+ deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py,sha256=-3utoiC9fP2UFiJ-u7KbESNiHCRVzh5NGtSld0xRXX0,1143
+ deltacat/compute/metastats/utils/ray_utils.py,sha256=sEDzcA0K8DMbQ_i8axBCQiPRrySPM14piaTqzKqhkss,4516
+ deltacat/compute/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/stats/basic.py,sha256=m_tDdtLbsyyky-UJ0UULBZDoAAjYr02O0sSvFCKyHGk,8837
+ deltacat/compute/stats/types.py,sha256=cp0lT8nITTKbnkc03OysRjXfcfXzQml9a4wqCnR6kqs,215
+ deltacat/compute/stats/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/stats/models/delta_column_stats.py,sha256=-wXjB2c0BC1RDheumjL_j5-DfRNql4WsK9GpMFQI1cg,3300
+ deltacat/compute/stats/models/delta_stats.py,sha256=hBith8_hbF9TVr6HocLAt6RJ_kZZKO4zrGP8VOP05vA,8556
+ deltacat/compute/stats/models/delta_stats_cache_result.py,sha256=mbJYxpZd5jaER_BWrCD2hROFy3p1nNdBrj66nUpc6io,1624
+ deltacat/compute/stats/models/manifest_entry_stats.py,sha256=NCDAe2nPDEI4kOkuwNkRFgGPS-rqQaQqLuaLoKk20KQ,2419
+ deltacat/compute/stats/models/stats_result.py,sha256=XQAlmzhUqRmg4jzEMUAOqcYn1HUOBTMryBH1CCVlet8,3820
+ deltacat/compute/stats/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/compute/stats/utils/intervals.py,sha256=9ezOzIrBGU1fWBuAn1CorJ3uX5COU7vxrfA7kI1cB7I,3094
+ deltacat/compute/stats/utils/io.py,sha256=ZXpntXqa41l5bxxAa2vcTW5mVpWeBIvd3QA9VWnX-aw,8573
+ deltacat/compute/stats/utils/manifest_stats_file.py,sha256=PtqW5Zc5e09HcfiAgvoZHVMJ2gamGdwmynMXOJNJUaY,3693
+ deltacat/io/__init__.py,sha256=5Al7BPSaQghEp1K3PfiKIJJ0HR6MUuaN7HTMyM_9lf4,154
+ deltacat/io/dataset.py,sha256=8w9sPVDpGnjjGVDWB39YSKWxq4zRv9VEfDtj7PYwjqM,3755
+ deltacat/io/read_api.py,sha256=BhkjL3xjY-fsa62AA9Yv20_88uTskn4_Bv2W6VmMXVA,7023
+ deltacat/io/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/io/aws/redshift/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/io/aws/redshift/redshift_datasource.py,sha256=X183O4tgBqtaZOSFmMFvp-9mv8NX5kGvRvX0eoSX8rA,22599
+ deltacat/storage/__init__.py,sha256=ElzZuG5zrX9nUIe7f0Sp21WDX7yBoclclq3TIL-doag,1371
+ deltacat/storage/interface.py,sha256=pw8t0jCqPakw13wDpg_cW0eBGBpqG1GO0djg-ZNNW6Q,21133
+ deltacat/storage/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/storage/model/delta.py,sha256=bmcG1rF6mwUdM3YHh6M9gLV6uqxbwHZVNS3WHkXFeDw,13734
+ deltacat/storage/model/list_result.py,sha256=FgD6oYeKo0EPe8z7jC8T4pAFjBOuBwd4axxGrnYyBG4,2466
+ deltacat/storage/model/locator.py,sha256=1S7szmDSx-R4Z3arFNILOvS4t7dF7_rJNV9fHyRc3G4,1296
+ deltacat/storage/model/namespace.py,sha256=KI2umYWShXFTx1ykLwsQjuce078WYo_Hmavn3DDeBzE,2086
+ deltacat/storage/model/partition.py,sha256=6Sknqi2GhtaSpkM--3oMjR9agRLHS4i7izFWM4iiGmY,11068
+ deltacat/storage/model/stream.py,sha256=XZ-c4EQR89NWydEOEG5GCaT8zST10OmjLZBKHZPdrzA,7738
+ deltacat/storage/model/table.py,sha256=IOu1ZOrdRkVDB-FOxYMRvnNf5TukIDfbdHWTqHYN_OY,4225
+ deltacat/storage/model/table_version.py,sha256=j57er3zlN0_2kwVMpWZ3iouABO-Kl8_Txi0UWIZ0dtk,7034
+ deltacat/storage/model/types.py,sha256=LQPe_CxcoW2N67Leu3fNbnSXhbl9ubDa8LVvBY0JUiY,1580
+ deltacat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/tests/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/tests/stats/test_intervals.py,sha256=S92DgkALQ1WmbLWcxtvS7RlVGvL-XoPJKUUbkdn9_CQ,1955
+ deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/tests/utils/test_record_batch_tables.py,sha256=yLExx5jZfi65uSjkdhOCGnP7Km6zWqKCzmULf1PEKA0,11322
+ deltacat/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/types/media.py,sha256=py1BnfMqNpJlW1RKzHWwB0NmQ33oCk9qg1fz7alvi3E,2187
+ deltacat/types/tables.py,sha256=yUzkzmUij8kssEYI_dfVDSLXf8HfMm_jpgWkPxDHAas,3893
+ deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/utils/common.py,sha256=RG_-enXNpLKaYrqyx1ne2lL10lxN9vK7F631oJP6SE8,1375
+ deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
+ deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
+ deltacat/utils/performance.py,sha256=rC3CPfroZP3T5TbRNZXB9GRBr0F9i2KUeZYL45JBgCU,610
+ deltacat/utils/placement.py,sha256=JE6OsW16VonlMhdH5B2IYuLJxItoYguaKpZNgbpMNLw,11066
+ deltacat/utils/pyarrow.py,sha256=Xf7KKTlA6wPJXcd_Uopm6iTSM9IlZ0M6Ajr4tWJP8OU,18230
+ deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/utils/ray_utils/collections.py,sha256=hj20s4D2RF2jZETU_44r6mFbsczA0JI_I_4kWKTmqes,1951
+ deltacat/utils/ray_utils/concurrency.py,sha256=AyL7hpvYjkmsz-KcpYjVgPpNsmu-x8-rlLyG0qXoV_c,5123
+ deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
+ deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
+ deltacat/utils/ray_utils/runtime.py,sha256=pUCSt2Fo8iMrvjmbkQdFtgSxZW9PA05H1molItzr5Rc,4786
+ deltacat-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-0.1.11.dist-info/METADATA,sha256=Ce07iEuy13s5C0jOpFDUkcL8jhDWy8BvWTP4GHPWDdw,1493
+ deltacat-0.1.11.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+ deltacat-0.1.11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-0.1.11.dist-info/RECORD,,
{deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.37.1)
+ Generator: bdist_wheel (0.38.4)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
File without changes
File without changes
@@ -1,82 +0,0 @@
1
- from typing import Dict, Any
2
-
3
- import math
4
-
5
-
6
- MB_IN_BYTES = 1_000_000
7
-
8
-
9
- class ClusterSizeSuggester:
10
- def __init__(self,
11
- cluster_memory_bytes: float = None,
12
- heap_memory_alloc_ratio: float = 0.7, # Ray defaults
13
- object_store_memory_alloc_ratio: float = 0.3, # Ray defaults
14
- instance_type: str = "r5.8xlarge"):
15
- """Given the total required memory constraints, constructs a helper class that
16
- recommends an instance type, number of instance nodes, max usable heap and object store memory,
17
-
18
- Args:
19
- cluster_memory_bytes: Total memory needed for the cluster.
20
- If not provided, defaults to memory size of instance type.
21
- heap_memory_alloc_ratio: Optional. Set to 0.7 by default.
22
- object_store_memory_alloc_ratio: Optional. Set to 0.3 by default.
23
- instance_type: Optional. Set to r5.8xlarge by default, to allow for up to 8GB of memory per vCPU.
24
- # TODO: suggest various r5 instance types based on memory input
25
- """
26
- self._instance_type = instance_type
27
- self.cluster_memory_bytes = cluster_memory_bytes if cluster_memory_bytes else self.get_node_memory_size()
28
- self.heap_memory_alloc_ratio = heap_memory_alloc_ratio
29
- self.object_store_memory_alloc_ratio = object_store_memory_alloc_ratio
30
-
31
- @property
32
- def instance_type(self):
33
- return self._instance_type
34
-
35
- def get_instance_type_specifications(self) -> Dict[str, Any]:
36
- """Assumes r5.8xlarge instances (for now)
37
-
38
- Returns: a dict of hardware details
39
-
40
- """
41
- # TODO: call ec2 describe-instance-types to extract hardware details (vCPUs, memory, network bandwidth).
42
- # Current implementation assumes we only serve r5.8xlarge node types.
43
- if self.instance_type == "r5.8xlarge":
44
- return {
45
- # Intentionally mimic the output format of describe-instance-types API
46
- "VCpuInfo": {
47
- "DefaultVCpus": 32,
48
- "DefaultCores": 16,
49
- "DefaultThreadsPerCore": 2,
50
- },
51
- "MemoryInfo": {
52
- "SizeInMiB": 262144
53
- }
54
- }
55
-
56
- def get_num_vcpu_per_node(self):
57
- spec = self.get_instance_type_specifications()
58
- return spec["VCpuInfo"]["DefaultVCpus"]
59
-
60
- def get_node_memory_size(self):
61
- spec = self.get_instance_type_specifications()
62
- return spec["MemoryInfo"]["SizeInMiB"] * MB_IN_BYTES
63
-
64
- def get_max_memory_per_vcpu(self):
65
- return self.get_node_memory_size() / self.get_num_vcpu_per_node()
66
-
67
- def get_node_max_object_store_memory(self):
68
- return self.object_store_memory_alloc_ratio * self.get_node_memory_size()
69
-
70
- def get_node_max_heap_memory(self):
71
- return self.heap_memory_alloc_ratio * self.get_node_memory_size()
72
-
73
- def get_suggested_vcpu_count(self):
74
- return self.cluster_memory_bytes / self.get_max_memory_per_vcpu()
75
-
76
- def get_suggested_node_size(self):
77
- return math.ceil(self.cluster_memory_bytes / (self.get_num_vcpu_per_node() * self.get_max_memory_per_vcpu()))
78
-
79
-
80
- class InstanceTypeSuggester:
81
- def __init__(self):
82
- raise NotImplementedError("Instance Type Suggester is not implemented.")
deltacat/autoscaler/events/compaction/collections/partition_key_value.py (removed)
@@ -1,36 +0,0 @@
- from typing import List, Iterable
-
-
- class PartitionKeyValue(dict):
- @staticmethod
- def of(key_name: str,
- key_type: str,
- value: str):
- pkv = PartitionKeyValue()
- pkv["key"] = {}
- pkv["key"]["keyName"] = key_name
- pkv["key"]["keyType"] = key_type
- pkv["value"] = value
- return pkv
-
- @property
- def key(self):
- return self["key"]
-
- @property
- def key_name(self):
- return self["key"]["keyName"]
-
- @property
- def key_type(self):
- return self["key"]["keyType"]
-
- @property
- def value(self):
- return self["value"]
-
-
- class PartitionKeyValues(tuple):
- @property
- def id(self):
- return ".".join([pkv.value for pkv in self if isinstance(pkv, PartitionKeyValue)])
deltacat/autoscaler/events/compaction/dispatcher.py (removed)
@@ -1,28 +0,0 @@
- import logging
-
- from deltacat.autoscaler.events.compaction.session_manager import CompactionSessionManager
- from deltacat.autoscaler.events.dispatcher import EventDispatcher
- from ray.autoscaler._private.aws.events import EventPublisher
-
- from deltacat import logs
- from deltacat.storage import interface as unimplemented_deltacat_storage
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- class CompactionEventDispatcher(EventDispatcher):
- def __init__(self,
- events_publisher: EventPublisher,
- deltacat_storage: unimplemented_deltacat_storage,
- session_manager: CompactionSessionManager = None):
- """Constructor for the event dispatcher.
-
- Intended for usage by Ray parent and child clusters running compaction jobs.
-
- Args:
- events_publisher: Events manager for publishing events through a cloud provider
- deltacat_storage: Storage interface for deltacat
- session_manager: Manager for launching child Ray sessions
- """
- super().__init__(events_publisher, deltacat_storage, session_manager)
- self.session_manager = session_manager
deltacat/autoscaler/events/compaction/input.py (removed)
@@ -1,27 +0,0 @@
- from typing import Set, NamedTuple, List, Optional, Dict
-
- from deltacat import ContentType, SortKey
- from deltacat.autoscaler.events.compaction.collections.partition_key_value import PartitionKeyValues
- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.storage import PartitionLocator, interface as unimplemented_deltacat_storage
-
- import pyarrow as pa
-
-
- class CompactionInput(NamedTuple):
- source_partition_locator: PartitionLocator
- compacted_partition_locator: PartitionLocator
- primary_keys: Set[str]
- compaction_artifact_s3_bucket: str
- last_stream_position_to_compact: int
- hash_bucket_count: Optional[int] = None
- sort_keys: List[SortKey] = None
- records_per_primary_key_index_file: int = 38_000_000
- records_per_compacted_file: int = 4_000_000
- input_deltas_stats: Optional[Dict[int, DeltaStats]] = None
- min_hash_bucket_chunk_size: int = 0
- compacted_file_content_type: ContentType = ContentType.PARQUET
- delete_prev_primary_key_index: bool = False
- schema_on_read: Optional[pa.schema] = None
- deltacat_storage = unimplemented_deltacat_storage
- partition_key_values: PartitionKeyValues = None
deltacat/autoscaler/events/compaction/process.py (removed)
@@ -1,25 +0,0 @@
- from typing import Optional
-
- from deltacat.autoscaler.events.compaction.collections.partition_key_value import PartitionKeyValues
- from deltacat.storage import PartitionLocator
-
-
- class CompactionProcess:
- def __init__(self,
- partition_locator: PartitionLocator,
- compaction_cluster_config_path: str,
- hash_bucket_count: Optional[int] = None,
- last_stream_position_to_compact: Optional[int] = None,
- partition_key_values: PartitionKeyValues = None,
- cluster_memory_bytes: Optional[int] = None,
- input_delta_total_bytes: Optional[int] = None):
- self.partition_locator = partition_locator
- self.compaction_cluster_config_path = compaction_cluster_config_path
- self.hash_bucket_count = hash_bucket_count
- self.last_stream_position_to_compact = last_stream_position_to_compact
- self.partition_values = partition_key_values
- self.cluster_memory_bytes = cluster_memory_bytes
- self.input_delta_total_bytes = input_delta_total_bytes
- self.id = ".".join([pkv.value for pkv in partition_key_values])
-
-
deltacat/autoscaler/events/compaction/session_manager.py (removed)
@@ -1,13 +0,0 @@
- from typing import List
-
- from deltacat.autoscaler.events.compaction.process import CompactionProcess
- from deltacat.autoscaler.events.session_manager import SessionManager
- from deltacat.storage import PartitionLocator
-
-
- class CompactionSessionManager(SessionManager):
- def launch_stats_metadata_collection(self, source_partition_locators: List[PartitionLocator]):
- raise NotImplementedError("Stats Metadata Collection is not implemented.")
-
- def launch_compaction(self, compaction_processes: List[CompactionProcess]):
- raise NotImplementedError("Compaction is not implemented.")