deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,7 @@ import functools
  import logging
  import os
  import pathlib
- from typing import Dict, List, Optional, Set
+ from typing import Any, Dict, List, Optional, Set

  import ray
  from ray.types import ObjectRef
@@ -118,10 +118,12 @@ def collect_from_partition(
      stat_results_s3_bucket: Optional[str] = None,
      metastats_results_s3_bucket: Optional[str] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
      *args,
      **kwargs,
  ) -> ObjectRef[Dict[int, DeltaStats]]:
-
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
      if not columns:
          columns = deltacat_storage.get_table_version_column_names(
              source_partition_locator.namespace,
@@ -33,6 +33,7 @@ def start_stats_collection(
      stat_results_s3_bucket: Optional[str] = None,
      metastats_results_s3_bucket: Optional[str] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
  ) -> Dict[str, List[DeltaStats]]:
      """Collects statistics on deltas, given a set of delta stream position ranges.
      Example:
@@ -171,6 +171,7 @@ def collect_stats_by_columns(
      delta_annotated: DeltaAnnotated,
      columns_to_compute: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  ) -> Dict[str, Any]:
      """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

@@ -182,6 +183,8 @@ def collect_stats_by_columns(
      Returns:
          A delta wide stats container
      """
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
      total_tables_size = 0

      # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
@@ -198,6 +201,7 @@
                  TableType.PYARROW,
                  columns_to_compute,
                  equivalent_table_types="uncompacted",
+                 **deltacat_storage_kwargs,
              )
          )
          assert isinstance(entry_pyarrow_table, pyarrow.Table), (
@@ -1,6 +1,6 @@
  import logging
  from collections import defaultdict
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List, Optional

  import pyarrow
  import ray
@@ -83,6 +83,7 @@ def get_delta_stats(
      delta_locator: DeltaLocator,
      columns: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  ) -> DeltaStats:
      """Ray distributed task to compute and collect stats for a requested delta.
      If no columns are requested, stats will be computed for all columns.
@@ -93,10 +94,15 @@
      Returns:
          A delta wide stats container
      """
-
-     manifest = deltacat_storage.get_delta_manifest(delta_locator)
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
+     manifest = deltacat_storage.get_delta_manifest(
+         delta_locator, **deltacat_storage_kwargs
+     )
      delta = Delta.of(delta_locator, None, None, None, manifest)
-     return _collect_stats_by_columns(delta, columns, deltacat_storage)
+     return _collect_stats_by_columns(
+         delta, columns, deltacat_storage, deltacat_storage_kwargs
+     )


  @ray.remote
@@ -105,6 +111,7 @@ def get_deltas_from_range(
      start_position_inclusive: DeltaRange,
      end_position_inclusive: DeltaRange,
      deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
  ) -> List[Delta]:
      """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.

@@ -137,6 +144,7 @@
          end_position_inclusive,
          ascending_order=True,
          include_manifest=False,
+         **kwargs,
      )
      return deltas_list_result.all_items()

@@ -145,6 +153,7 @@ def _collect_stats_by_columns(
      delta: Delta,
      columns_to_compute: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  ) -> DeltaStats:
      """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
      Args:
@@ -154,6 +163,8 @@
      Returns:
          A delta wide stats container
      """
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
      assert (
          delta.manifest is not None
      ), f"Manifest should not be missing from delta for stats calculation: {delta}"
@@ -167,7 +178,11 @@
      for file_idx, manifest in enumerate(delta.manifest.entries):
          entry_pyarrow_table: LocalTable = (
              deltacat_storage.download_delta_manifest_entry(
-                 delta, file_idx, TableType.PYARROW, columns_to_compute
+                 delta,
+                 file_idx,
+                 TableType.PYARROW,
+                 columns_to_compute,
+                 **deltacat_storage_kwargs,
              )
          )
          assert isinstance(entry_pyarrow_table, pyarrow.Table), (
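Every hunk above applies the same convention: the stats entry points gain an optional deltacat_storage_kwargs dict that is normalized to an empty dict and forwarded to each deltacat_storage call. A minimal sketch of the pattern, using a hypothetical caller-side helper (only the deltacat_storage_kwargs handling mirrors the actual change):

```python
from typing import Any, Dict, Optional


def collect_stats_example(  # hypothetical helper, not part of deltacat
    delta_locator,
    deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
):
    # Default to None (not {}) so a shared mutable default is never reused.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    # Backend-specific options ride along to every storage call, e.g. the
    # database file path used by the local test storage implementation.
    return deltacat_storage.get_delta_manifest(
        delta_locator, **deltacat_storage_kwargs
    )
```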
deltacat/exceptions.py CHANGED
@@ -8,3 +8,7 @@ class NonRetryableError(Exception):

  class ConcurrentModificationError(Exception):
      pass
+
+
+ class ValidationError(NonRetryableError):
+     pass
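Since ValidationError subclasses NonRetryableError, existing handlers for non-retryable failures also catch the new validation failures. A short example (the raise site and message are illustrative):

```python
from deltacat.exceptions import NonRetryableError, ValidationError

try:
    raise ValidationError("hash bucket count must be a positive integer")
except NonRetryableError as err:
    # Validation failures should not be retried; surface them to the caller.
    print(f"non-retryable compaction error: {err}")
```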
@@ -3,13 +3,14 @@ from ray import cloudpickle
  from collections import defaultdict
  import time
  from deltacat.io.object_store import IObjectStore
- from typing import Any, List
+ from typing import Any, List, Optional
  from deltacat import logs
  import uuid
  import socket
  from pymemcache.client.base import Client
  from pymemcache.client.retrying import RetryingClient
  from pymemcache.exceptions import MemcacheUnexpectedCloseError
+ from pymemcache.client.rendezvous import RendezvousHash

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -19,36 +20,46 @@ class MemcachedObjectStore(IObjectStore):
      An implementation of object store that uses Memcached.
      """

-     def __init__(self, port=11212) -> None:
+     def __init__(
+         self, storage_node_ips: Optional[List[str]] = None, port: Optional[int] = 11212
+     ) -> None:
          self.client_cache = {}
          self.current_ip = None
          self.SEPARATOR = "_"
          self.port = port
+         self.storage_node_ips = storage_node_ips
+         self.hasher = None
          super().__init__()

+     def initialize_hasher(self):
+         if not self.hasher and self.storage_node_ips:
+             self.hasher = RendezvousHash()
+             for n in self.storage_node_ips:
+                 self.hasher.add_node(n)
+
      def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
-         input = {}
+         input = defaultdict(dict)
          result = []
-         current_ip = self._get_current_ip()
          for obj in objects:
              serialized = cloudpickle.dumps(obj)
              uid = uuid.uuid4()
-             ref = self._create_ref(uid, current_ip)
-             input[uid.__str__()] = serialized
+             create_ref_ip = self._get_create_ref_ip(uid.__str__())
+             ref = self._create_ref(uid, create_ref_ip)
+             input[create_ref_ip][uid.__str__()] = serialized
              result.append(ref)
-
-         client = self._get_client_by_ip(current_ip)
-         if client.set_many(input, noreply=False):
-             raise RuntimeError("Unable to write few keys to cache")
+         for create_ref_ip, uid_to_object in input.items():
+             client = self._get_client_by_ip(create_ref_ip)
+             if client.set_many(uid_to_object, noreply=False):
+                 raise RuntimeError("Unable to write few keys to cache")

          return result

      def put(self, obj: object, *args, **kwargs) -> Any:
          serialized = cloudpickle.dumps(obj)
          uid = uuid.uuid4()
-         current_ip = self._get_current_ip()
-         ref = self._create_ref(uid, current_ip)
-         client = self._get_client_by_ip(current_ip)
+         create_ref_ip = self._get_create_ref_ip(uid.__str__())
+         ref = self._create_ref(uid, create_ref_ip)
+         client = self._get_client_by_ip(create_ref_ip)

          if client.set(uid.__str__(), serialized):
              return ref
@@ -99,6 +110,18 @@ class MemcachedObjectStore(IObjectStore):
      def _create_ref(self, uid, ip) -> str:
          return f"{uid}{self.SEPARATOR}{ip}"

+     def _get_storage_node_ip(self, key: str):
+         self.initialize_hasher()
+         storage_node_ip = self.hasher.get_node(key)
+         return storage_node_ip
+
+     def _get_create_ref_ip(self, uid: str):
+         if self.storage_node_ips:
+             create_ref_ip = self._get_storage_node_ip(uid)
+         else:
+             create_ref_ip = self._get_current_ip()
+         return create_ref_ip
+
      def _get_client_by_ip(self, ip_address: str):
          if ip_address in self.client_cache:
              return self.client_cache[ip_address]
@@ -108,7 +131,7 @@ class MemcachedObjectStore(IObjectStore):
              base_client,
              attempts=3,
              retry_delay=0.01,
-             retry_for=[MemcacheUnexpectedCloseError],
+             retry_for=[MemcacheUnexpectedCloseError, ConnectionResetError],
          )

          self.client_cache[ip_address] = client
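Taken together, these changes let MemcachedObjectStore route each object to a cache node selected by rendezvous hashing of the object's UUID whenever dedicated storage_node_ips are supplied, and put_many now issues one set_many per target node. A brief usage sketch, assuming placeholder IP addresses and the standard IObjectStore read path:

```python
from deltacat.io.memcached_object_store import MemcachedObjectStore

# Spread objects across two dedicated memcached nodes via rendezvous hashing;
# without storage_node_ips the store falls back to the local node as before.
object_store = MemcachedObjectStore(
    storage_node_ips=["10.0.0.1", "10.0.0.2"], port=11212
)
refs = object_store.put_many(["payload-a", "payload-b", "payload-c"])
# Each ref is "<uuid>_<node-ip>", so reads contact the node chosen at write time.
values = object_store.get_many(refs)
```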
deltacat/logs.py CHANGED
@@ -143,9 +143,10 @@ def _configure_logger(
          log_dir, log_base_file_name, primary_log_level
      )
      _add_logger_handler(logger, handler)
-     ray_runtime_ctx = ray.get_runtime_context()
-     if ray_runtime_ctx.worker.connected:
-         logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
+     if ray.is_initialized():
+         ray_runtime_ctx = ray.get_runtime_context()
+         if ray_runtime_ctx.worker.connected:
+             logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)

      return logger

@@ -21,6 +21,7 @@ from deltacat.storage import (
      Table,
      TableVersion,
      SortKey,
+     PartitionLocator,
  )
  from deltacat.types.media import ContentType, StorageType, TableType
  from deltacat.utils.common import ReadKwargsProvider
@@ -105,7 +106,13 @@


  def list_partition_deltas(
-     partition: Partition, include_manifest: bool = False, *args, **kwargs
+     partition_like: Union[Partition, PartitionLocator],
+     first_stream_position: Optional[int] = None,
+     last_stream_position: Optional[int] = None,
+     ascending_order: bool = False,
+     include_manifest: bool = False,
+     *args,
+     **kwargs
  ) -> ListResult[Delta]:
      """
      Lists a page of deltas committed to the given partition.
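The widened list_partition_deltas signature lets callers filter by stream position and control ordering at the storage interface instead of post-filtering. A hedged call sketch against the new signature, where ds is any module implementing deltacat.storage.interface and partition is an existing Partition or PartitionLocator:

```python
# List deltas between two stream positions, oldest first, without manifests.
deltas = ds.list_partition_deltas(
    partition_like=partition,
    first_stream_position=100,
    last_stream_position=200,
    ascending_order=True,
    include_manifest=False,
).all_items()
```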
@@ -1,6 +1,7 @@
  from enum import Enum
  from typing import List, Union, Any

+ from pyarrow.parquet import ParquetFile
  import numpy as np
  import pandas as pd
  import pyarrow as pa
@@ -8,7 +9,7 @@ import pkg_resources
  from ray.data._internal.arrow_block import ArrowRow
  from ray.data.dataset import Dataset

- LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray]
+ LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
  LocalDataset = List[LocalTable]
  # Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
  # and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
@@ -1,7 +1,7 @@
  from unittest.mock import patch
  import unittest
- import json
  from http import HTTPStatus
+ import requests

  HAPPY_RESPONSE = {
      "AccessKeyId": "ASIA123456789",
@@ -20,7 +20,7 @@ class MockResponse:
      """

      def __init__(self, status_code: int, text: str, reason: str = "") -> None:
-         self.status_code = status_code
+         self.status_code: requests.Response.status_code = status_code
          self.text = text
          self.reason = reason

@@ -55,7 +55,7 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
          )

          requests_mock.get.side_effect = [
-             MockResponse(HTTPStatus.OK, json.dumps(HAPPY_RESPONSE)),
+             MockResponse(HTTPStatus.OK, "foo"),
              MockResponse(HTTPStatus.TOO_MANY_REQUESTS, "foo"),
              MockResponse(HTTPStatus.INTERNAL_SERVER_ERROR, "foo"),
              MockResponse(HTTPStatus.NOT_IMPLEMENTED, "bar"),
@@ -65,3 +65,16 @@
          self.assertEqual(
              block_until_instance_metadata_service_returns_success().status_code, 200
          )
+
+     @patch("deltacat.aws.clients.requests")
+     def test_retrying_status_on_shortlist_returns_early(self, requests_mock):
+         from deltacat.aws.clients import (
+             block_until_instance_metadata_service_returns_success,
+         )
+
+         requests_mock.get.side_effect = [
+             MockResponse(HTTPStatus.FORBIDDEN, "foo"),
+         ]
+         self.assertEqual(
+             block_until_instance_metadata_service_returns_success().status_code, 403
+         )
@@ -0,0 +1,96 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
+ from enum import Enum
+ from typing import List
+ import datetime as dt
+ from datetime import timezone
+
+ TEST_S3_RCF_BUCKET_NAME = "test-compaction-artifacts-bucket"
+ # REBASE src = spark compacted table to create an initial version of ray compacted table
+ BASE_TEST_SOURCE_NAMESPACE = "source_test_namespace"
+ BASE_TEST_SOURCE_TABLE_NAME = "test_table"
+ BASE_TEST_SOURCE_TABLE_VERSION = "1"
+
+ BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
+ BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
+ BASE_TEST_DESTINATION_TABLE_VERSION = "1"
+
+ HASH_BUCKET_COUNT: int = 3
+
+ MAX_RECORDS_PER_FILE: int = 1
+
+ UTC_ISO_8601_FORMAT_WITHOUT_MILLIS = "%Y-%m-%dT%H:%M:%SZ"  # '2018-09-05T14:09:03Z'
+
+
+ class PartitionKeyType(str, Enum):
+     INT = "int"
+     STRING = "string"
+     TIMESTAMP = "timestamp"
+
+
+ class PartitionKey(dict):
+     @staticmethod
+     def of(key_name: str, key_type: PartitionKeyType) -> PartitionKey:
+         return PartitionKey({"keyName": key_name, "keyType": key_type.value})
+
+     @property
+     def key_name(self) -> str:
+         return self["keyName"]
+
+     @property
+     def key_type(self) -> PartitionKeyType:
+         key_type = self["keyType"]
+         return None if key_type is None else PartitionKeyType(key_type)
+
+
+ def setup_sort_and_partition_keys(sort_keys_param, partition_keys_param):
+     from deltacat.storage.model.sort_key import SortKey
+
+     sort_keys, partition_keys = None, None
+     if sort_keys_param is not None:
+         sort_keys = [SortKey.of(sort_key["key_name"]) for sort_key in sort_keys_param]
+     if partition_keys_param is not None:
+         partition_keys = [
+             PartitionKey.of(
+                 partition_key["key_name"], PartitionKeyType(partition_key["key_type"])
+             )
+             for partition_key in partition_keys_param
+         ]
+     return sort_keys, partition_keys
+
+
+ def offer_iso8601_timestamp_list(
+     periods: int,
+     unit_of_time: str,
+     end_time=dt.datetime(2023, 5, 3, 10, 0, 0, 0, tzinfo=timezone.utc),
+ ) -> List[str]:
+     """
+     Returns a list of ISO 8601 timestamps, each periods units of time before the start time.
+
+     Args:
+         periods: The number of timestamps to return.
+         unit_of_time: The unit of time to use for the timestamps. Must be one of "seconds", "minutes", "hours", "days", or "weeks".
+         end_time: The end time for the timestamps. Defaults to 2023-05-03T10:00:00Z.
+
+     Returns:
+         A list of ISO 8601 timestamps, each periods units of time before the start time.
+
+     Raises:
+         ValueError: If the unit_of_time argument is not one of "seconds", "minutes", "hours", "days", or "weeks".
+     """
+     import datetime as dt
+
+     acceptable_units_of_time = ["seconds", "minutes", "hours", "days", "weeks"]
+     if unit_of_time not in acceptable_units_of_time:
+         raise ValueError(
+             f"unit_of_time {unit_of_time} is not supported. Please use one of these time units: {acceptable_units_of_time}"
+         )
+     res = []
+     for i in range(periods):
+         kwarg = {unit_of_time: i}
+         res.append(
+             (end_time - dt.timedelta(**kwarg)).strftime(
+                 UTC_ISO_8601_FORMAT_WITHOUT_MILLIS
+             )
+         )
+     return res
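For reference, a quick usage sketch of the new offer_iso8601_timestamp_list test helper; with the default end_time of 2023-05-03T10:00:00Z, each successive entry steps back one unit:

```python
from deltacat.tests.compute.common import offer_iso8601_timestamp_list

# Three hourly timestamps, counting back from the default end_time.
print(offer_iso8601_timestamp_list(3, "hours"))
# ['2023-05-03T10:00:00Z', '2023-05-03T09:00:00Z', '2023-05-03T08:00:00Z']
```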
@@ -49,8 +49,10 @@ class TestRepartitionRange(unittest.TestCase):
          self.destination_partition: PartitionLocator = MagicMock()
          self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
          self.max_records_per_output_file = 2
+         self.s3_table_writer_kwargs = {}
          self.repartitioned_file_content_type = ContentType.PARQUET
          self.deltacat_storage = MagicMock()
+         self.deltacat_storage_kwargs = MagicMock()

      def test_repartition_range(self):
          result = repartition_range(
@@ -58,8 +60,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )
          # Assert that a RepartitionResult object is returned
          self.assertIsInstance(result, RepartitionResult)
@@ -83,8 +87,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )

      def test_empty_ranges(self):
@@ -95,8 +101,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )

      def test_one_value_in_ranges(self):
@@ -106,8 +114,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )
          self.assertEqual(len(result.range_deltas), 2)

@@ -118,8 +128,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )
          self.assertEqual(len(result.range_deltas), 3)

@@ -131,8 +143,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )
          self.assertLess(len(result.range_deltas), 2)

@@ -144,8 +158,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )

      def test_unsorted_ranges(self):
@@ -159,8 +175,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )
          self.assertEqual(len(result.range_deltas), 4)

@@ -171,22 +189,28 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )
          self.assertEqual(len(result.range_deltas), 2)

      def test_ranges_with_inf(self):
          self.repartition_args["ranges"] = [1678665487112747, float("inf")]
-         result = repartition_range(
-             self.tables,
-             self.destination_partition,
-             self.repartition_args,
-             self.max_records_per_output_file,
-             self.repartitioned_file_content_type,
-             self.deltacat_storage,
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: repartition_range(
+                 self.tables,
+                 self.destination_partition,
+                 self.repartition_args,
+                 self.max_records_per_output_file,
+                 self.s3_table_writer_kwargs,
+                 self.repartitioned_file_content_type,
+                 self.deltacat_storage,
+             ),
          )
-         self.assertEqual(len(result.range_deltas), 2)

      def test_null_rows_are_not_dropped(self):
          # Add null value to the first table
@@ -209,8 +233,10 @@
              self.destination_partition,
              self.repartition_args,
              self.max_records_per_output_file,
+             self.s3_table_writer_kwargs,
              self.repartitioned_file_content_type,
              self.deltacat_storage,
+             self.deltacat_storage_kwargs,
          )

          # Assuming range_deltas is a list of DataFrames,