deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -1,23 +1,49 @@
1
1
  import logging
2
2
 
3
- import deltacat.logs
4
- from deltacat.types.media import ContentType, ContentEncoding, TableType
5
- from deltacat.types.tables import TableWriteMode
6
- from deltacat.storage import ListResult, Namespace, LifecycleState, \
7
- SchemaConsistencyType, LocalTable, LocalDataset, DistributedDataset
8
- from deltacat.compute.compactor import SortKey, SortOrder
3
+ import deltacat.logs # noqa: F401
4
+ from deltacat.catalog.delegate import (
5
+ alter_namespace,
6
+ alter_table,
7
+ create_namespace,
8
+ create_table,
9
+ default_namespace,
10
+ drop_namespace,
11
+ drop_table,
12
+ get_namespace,
13
+ get_table,
14
+ list_namespaces,
15
+ list_tables,
16
+ namespace_exists,
17
+ read_table,
18
+ refresh_table,
19
+ rename_table,
20
+ table_exists,
21
+ truncate_table,
22
+ write_to_table,
23
+ )
24
+ from deltacat.catalog.model.catalog import ( # noqa: F401
25
+ Catalog,
26
+ Catalogs,
27
+ all_catalogs,
28
+ init,
29
+ )
9
30
  from deltacat.catalog.model.table_definition import TableDefinition
10
- from deltacat.catalog.model.catalog import Catalog, Catalogs, all_catalogs, \
11
- init
12
- from deltacat.catalog.delegate import alter_table, create_table, drop_table, \
13
- refresh_table, list_tables, get_table, truncate_table, rename_table, \
14
- table_exists, list_namespaces, alter_namespace, create_namespace, \
15
- drop_namespace, default_namespace, get_namespace, namespace_exists, \
16
- write_to_table, read_table
31
+ from deltacat.compute.compactor import SortKey, SortOrder
32
+ from deltacat.storage import (
33
+ DistributedDataset,
34
+ LifecycleState,
35
+ ListResult,
36
+ LocalDataset,
37
+ LocalTable,
38
+ Namespace,
39
+ SchemaConsistencyType,
40
+ )
41
+ from deltacat.types.media import ContentEncoding, ContentType, TableType
42
+ from deltacat.types.tables import TableWriteMode
17
43
 
18
- logs.configure_deltacat_logger(logging.getLogger(__name__))
44
+ deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
19
45
 
20
- __version__ = "0.1.10.dev"
46
+ __version__ = "0.1.12"
21
47
 
22
48
 
23
49
  __all__ = [
deltacat/aws/clients.py CHANGED
@@ -1,28 +1,21 @@
1
- import boto3
2
1
  import logging
2
+ from functools import lru_cache
3
+ from typing import Optional
4
+
5
+ import boto3
6
+ from boto3.exceptions import ResourceNotExistsError
3
7
  from boto3.resources.base import ServiceResource
4
8
  from botocore.client import BaseClient
5
9
  from botocore.config import Config
6
- from boto3.exceptions import ResourceNotExistsError
7
- from functools import lru_cache
10
+
8
11
  from deltacat import logs
9
12
  from deltacat.aws.constants import BOTO_MAX_RETRIES
10
13
 
11
- from typing import Optional
12
-
13
14
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
14
15
 
15
16
 
16
- def _resource(
17
- name: str,
18
- region: Optional[str],
19
- **kwargs) -> ServiceResource:
20
- boto_config = Config(
21
- retries={
22
- "max_attempts": BOTO_MAX_RETRIES,
23
- "mode": 'standard'
24
- }
25
- )
17
+ def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
18
+ boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "standard"})
26
19
  return boto3.resource(
27
20
  name,
28
21
  region,
@@ -31,20 +24,14 @@ def _resource(
31
24
  )
32
25
 
33
26
 
34
- def _client(
35
- name: str,
36
- region: Optional[str],
37
- **kwargs) -> BaseClient:
27
+ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
38
28
  try:
39
29
  # try to re-use a client from the resource cache first
40
30
  return resource_cache(name, region, **kwargs).meta.client
41
31
  except ResourceNotExistsError:
42
32
  # fall back for clients without an associated resource
43
33
  boto_config = Config(
44
- retries={
45
- "max_attempts": BOTO_MAX_RETRIES,
46
- "mode": 'standard'
47
- }
34
+ retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "standard"}
48
35
  )
49
36
  return boto3.client(
50
37
  name,
@@ -54,19 +41,13 @@ def _client(
54
41
  )
55
42
 
56
43
 
57
- def resource_cache(
58
- name: str,
59
- region: Optional[str],
60
- **kwargs) -> ServiceResource:
44
+ def resource_cache(name: str, region: Optional[str], **kwargs) -> ServiceResource:
61
45
  # we don't use the @lru_cache decorator because Ray can't pickle it
62
46
  cached_function = lru_cache()(_resource)
63
47
  return cached_function(name, region, **kwargs)
64
48
 
65
49
 
66
- def client_cache(
67
- name: str,
68
- region: Optional[str],
69
- **kwargs) -> BaseClient:
50
+ def client_cache(name: str, region: Optional[str], **kwargs) -> BaseClient:
70
51
  # we don't use the @lru_cache decorator because Ray can't pickle it
71
52
  cached_function = lru_cache()(_client)
72
53
  return cached_function(name, region, **kwargs)
deltacat/aws/constants.py CHANGED
@@ -1,6 +1,6 @@
1
- from deltacat.utils.common import env_integer
2
1
  from typing import List
3
2
 
3
+ from deltacat.utils.common import env_integer
4
4
 
5
5
  BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 15)
6
6
  TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
deltacat/aws/redshift/__init__.py CHANGED
@@ -1,5 +1,10 @@
1
- from deltacat.aws.redshift.model.manifest import Manifest, ManifestAuthor, \
2
- ManifestEntry, ManifestMeta, ManifestEntryList
1
+ from deltacat.aws.redshift.model.manifest import (
2
+ Manifest,
3
+ ManifestAuthor,
4
+ ManifestEntry,
5
+ ManifestEntryList,
6
+ ManifestMeta,
7
+ )
3
8
 
4
9
  __all__ = [
5
10
  "Manifest",
deltacat/aws/redshift/model/manifest.py CHANGED
@@ -1,12 +1,12 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- import logging
5
4
  import itertools
5
+ import logging
6
+ from typing import Any, Dict, List, Optional
7
+ from uuid import uuid4
6
8
 
7
9
  from deltacat import logs
8
- from uuid import uuid4
9
- from typing import Any, Dict, List, Optional
10
10
 
11
11
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
12
12
 
@@ -14,10 +14,11 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
14
14
  class Manifest(dict):
15
15
  @staticmethod
16
16
  def _build_manifest(
17
- meta: Optional[ManifestMeta],
18
- entries: Optional[ManifestEntryList],
19
- author: Optional[ManifestAuthor] = None,
20
- uuid: str = None) -> Manifest:
17
+ meta: Optional[ManifestMeta],
18
+ entries: Optional[ManifestEntryList],
19
+ author: Optional[ManifestAuthor] = None,
20
+ uuid: str = None,
21
+ ) -> Manifest:
21
22
  if not uuid:
22
23
  uuid = str(uuid4())
23
24
  manifest = Manifest()
@@ -31,9 +32,11 @@ class Manifest(dict):
31
32
  return manifest
32
33
 
33
34
  @staticmethod
34
- def of(entries: ManifestEntryList,
35
- author: Optional[ManifestAuthor] = None,
36
- uuid: str = None) -> Manifest:
35
+ def of(
36
+ entries: ManifestEntryList,
37
+ author: Optional[ManifestAuthor] = None,
38
+ uuid: str = None,
39
+ ) -> Manifest:
37
40
  if not uuid:
38
41
  uuid = str(uuid4())
39
42
  total_record_count = 0
@@ -52,15 +55,19 @@ class Manifest(dict):
52
55
  content_encoding = None
53
56
  entry_content_type = meta.content_type
54
57
  if entry_content_type != content_type:
55
- msg = f"Expected all manifest entries to have content " \
56
- f"type '{content_type}' but found " \
57
- f"'{entry_content_type}'"
58
+ msg = (
59
+ f"Expected all manifest entries to have content "
60
+ f"type '{content_type}' but found "
61
+ f"'{entry_content_type}'"
62
+ )
58
63
  raise ValueError(msg)
59
64
  entry_content_encoding = meta["content_encoding"]
60
65
  if entry_content_encoding != content_encoding:
61
- msg = f"Expected all manifest entries to have content " \
62
- f"encoding '{content_encoding}' but found " \
63
- f"'{entry_content_encoding}'"
66
+ msg = (
67
+ f"Expected all manifest entries to have content "
68
+ f"encoding '{content_encoding}' but found "
69
+ f"'{entry_content_encoding}'"
70
+ )
64
71
  raise ValueError(msg)
65
72
  total_record_count += meta.record_count or 0
66
73
  total_content_length += meta.content_length or 0
@@ -70,25 +77,19 @@ class Manifest(dict):
70
77
  total_content_length,
71
78
  content_type,
72
79
  content_encoding,
73
- total_source_content_length
74
- )
75
- manifest = Manifest._build_manifest(
76
- meta,
77
- entries,
78
- author,
79
- uuid
80
+ total_source_content_length,
80
81
  )
82
+ manifest = Manifest._build_manifest(meta, entries, author, uuid)
81
83
  return manifest
82
84
 
83
85
  @staticmethod
84
86
  def merge_manifests(
85
- manifests: List[Manifest],
86
- author: Optional[ManifestAuthor] = None) -> Manifest:
87
+ manifests: List[Manifest], author: Optional[ManifestAuthor] = None
88
+ ) -> Manifest:
87
89
  all_entries = ManifestEntryList(
88
- itertools.chain(*[m.entries for m in manifests]))
89
- merged_manifest = Manifest.of(
90
- all_entries,
91
- author)
90
+ itertools.chain(*[m.entries for m in manifests])
91
+ )
92
+ merged_manifest = Manifest.of(all_entries, author)
92
93
  return merged_manifest
93
94
 
94
95
  @property
@@ -119,14 +120,15 @@ class Manifest(dict):
119
120
 
120
121
  class ManifestMeta(dict):
121
122
  @staticmethod
122
- def of(record_count: Optional[int],
123
- content_length: Optional[int],
124
- content_type: Optional[str],
125
- content_encoding: Optional[str],
126
- source_content_length: Optional[int] = None,
127
- credentials: Optional[Dict[str, str]] = None,
128
- content_type_parameters: Optional[List[Dict[str, str]]] = None) \
129
- -> ManifestMeta:
123
+ def of(
124
+ record_count: Optional[int],
125
+ content_length: Optional[int],
126
+ content_type: Optional[str],
127
+ content_encoding: Optional[str],
128
+ source_content_length: Optional[int] = None,
129
+ credentials: Optional[Dict[str, str]] = None,
130
+ content_type_parameters: Optional[List[Dict[str, str]]] = None,
131
+ ) -> ManifestMeta:
130
132
  manifest_meta = ManifestMeta()
131
133
  if record_count is not None:
132
134
  manifest_meta["record_count"] = record_count
@@ -175,8 +177,7 @@ class ManifestMeta(dict):
175
177
 
176
178
  class ManifestAuthor(dict):
177
179
  @staticmethod
178
- def of(name: Optional[str],
179
- version: Optional[str]) -> ManifestAuthor:
180
+ def of(name: Optional[str], version: Optional[str]) -> ManifestAuthor:
180
181
  manifest_author = ManifestAuthor()
181
182
  if name is not None:
182
183
  manifest_author["name"] = name
@@ -195,15 +196,16 @@ class ManifestAuthor(dict):
195
196
 
196
197
  class ManifestEntry(dict):
197
198
  @staticmethod
198
- def of(url: Optional[str],
199
- meta: Optional[ManifestMeta],
200
- mandatory: bool = True,
201
- uri: Optional[str] = None,
202
- uuid: Optional[str] = None) -> ManifestEntry:
199
+ def of(
200
+ url: Optional[str],
201
+ meta: Optional[ManifestMeta],
202
+ mandatory: bool = True,
203
+ uri: Optional[str] = None,
204
+ uuid: Optional[str] = None,
205
+ ) -> ManifestEntry:
203
206
  manifest_entry = ManifestEntry()
204
207
  if not (uri or url):
205
- raise ValueError(
206
- "No URI or URL specified for manifest entry contents.")
208
+ raise ValueError("No URI or URL specified for manifest entry contents.")
207
209
  if (uri and url) and (uri != url):
208
210
  raise ValueError(f"Manifest entry URI ({uri}) != URL ({url})")
209
211
  if url:
@@ -220,11 +222,13 @@ class ManifestEntry(dict):
220
222
 
221
223
  @staticmethod
222
224
  def from_s3_obj_url(
223
- url: str,
224
- record_count: int,
225
- source_content_length: Optional[int] = None,
226
- **s3_client_kwargs) -> ManifestEntry:
225
+ url: str,
226
+ record_count: int,
227
+ source_content_length: Optional[int] = None,
228
+ **s3_client_kwargs,
229
+ ) -> ManifestEntry:
227
230
  from deltacat.aws import s3u as s3_utils
231
+
228
232
  s3_obj = s3_utils.get_object_at_url(url, **s3_client_kwargs)
229
233
  logger.debug(f"Building manifest entry from {url}: {s3_obj}")
230
234
  manifest_entry_meta = ManifestMeta.of(