deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/__init__.py +4 -0
  3. deltacat/aws/redshift/model/manifest.py +93 -1
  4. deltacat/aws/s3u.py +250 -111
  5. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  6. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  7. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  8. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  10. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  11. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  12. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  13. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  14. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  15. deltacat/compute/compactor_v2/utils/task_options.py +47 -4
  16. deltacat/compute/merge_on_read/__init__.py +4 -0
  17. deltacat/compute/merge_on_read/daft.py +40 -0
  18. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  20. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  21. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  22. deltacat/storage/interface.py +10 -2
  23. deltacat/storage/model/types.py +3 -11
  24. deltacat/tests/catalog/__init__.py +0 -0
  25. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  26. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  27. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  28. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  29. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  30. deltacat/tests/test_utils/pyarrow.py +33 -14
  31. deltacat/tests/utils/test_daft.py +42 -2
  32. deltacat/types/media.py +5 -0
  33. deltacat/types/tables.py +7 -1
  34. deltacat/utils/daft.py +78 -13
  35. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
  36. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
  37. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
  38. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
  39. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,126 @@
+ import logging
+
+ from deltacat.compute.compactor.model.compact_partition_params import (
+     CompactPartitionParams,
+ )
+ from deltacat.compute.compactor_v2.model.merge_file_group import (
+     LocalMergeFileGroupsProvider,
+ )
+ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+ import pyarrow as pa
+ from deltacat import logs
+ from typing import List, Optional
+
+ from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+ from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+ from deltacat.compute.compactor import (
+     RoundCompletionInfo,
+     DeltaAnnotated,
+ )
+
+ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.storage import (
+     Partition,
+ )
+
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def materialize(
+     input: MergeInput,
+     task_index: int,
+     compacted_tables: List[pa.Table],
+ ) -> MaterializeResult:
+     compacted_table = pa.concat_tables(compacted_tables)
+     if input.compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+         # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+         # TODO (pdames): compare performance to pandas-native materialize path
+         df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
+         compacted_table = df
+     delta, stage_delta_time = timed_invocation(
+         input.deltacat_storage.stage_delta,
+         compacted_table,
+         input.write_to_partition,
+         max_records_per_entry=input.max_records_per_output_file,
+         content_type=input.compacted_file_content_type,
+         s3_table_writer_kwargs=input.s3_table_writer_kwargs,
+         **input.deltacat_storage_kwargs,
+     )
+     compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+         compacted_table
+     )
+     logger.debug(
+         f"Time taken for materialize task"
+         f" to upload {len(compacted_table)} records"
+         f" of size {compacted_table_size} is: {stage_delta_time}s"
+     )
+     manifest = delta.manifest
+     manifest_records = manifest.meta.record_count
+     assert manifest_records == len(compacted_table), (
+         f"Unexpected Error: Materialized delta manifest record count "
+         f"({manifest_records}) does not equal compacted table record count "
+         f"({len(compacted_table)})"
+     )
+     materialize_result = MaterializeResult.of(
+         delta=delta,
+         task_index=task_index,
+         # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+         # and in-memory-table-bytes instead of tight coupling to paBytes
+         pyarrow_write_result=PyArrowWriteResult.of(
+             len(manifest.entries),
+             TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+             manifest.meta.content_length,
+             len(compacted_table),
+         ),
+     )
+     logger.info(f"Materialize result: {materialize_result}")
+     return materialize_result
+
+
+ def generate_local_merge_input(
+     params: CompactPartitionParams,
+     annotated_deltas: List[DeltaAnnotated],
+     compacted_partition: Partition,
+     round_completion_info: Optional[RoundCompletionInfo],
+ ):
+     """
+     Generates a merge input for local deltas that do not reside in the Ray object store and
+     have not been subject to the hash bucketing process.
+
+     Args:
+         params: parameters for compacting a partition
+         annotated_deltas: a list of annotated deltas
+         compacted_partition: the compacted partition to write to
+         round_completion_info: keeps track of high watermarks and other metadata from previous compaction rounds
+
+     Returns:
+         A MergeInput object
+
+     """
+
+     return MergeInput.of(
+         merge_file_groups_provider=LocalMergeFileGroupsProvider(
+             annotated_deltas,
+             read_kwargs_provider=params.read_kwargs_provider,
+             deltacat_storage=params.deltacat_storage,
+             deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+         ),
+         write_to_partition=compacted_partition,
+         compacted_file_content_type=params.compacted_file_content_type,
+         primary_keys=params.primary_keys,
+         sort_keys=params.sort_keys,
+         drop_duplicates=params.drop_duplicates,
+         max_records_per_output_file=params.records_per_compacted_file,
+         enable_profiler=params.enable_profiler,
+         metrics_config=params.metrics_config,
+         s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+         read_kwargs_provider=params.read_kwargs_provider,
+         round_completion_info=round_completion_info,
+         object_store=params.object_store,
+         deltacat_storage=params.deltacat_storage,
+         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+     )
@@ -1,5 +1,7 @@
  import botocore
+ import logging
  from typing import Dict, Optional, List, Tuple
+ from deltacat import logs
  from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.partial_download import PartialParquetParameters
  from deltacat.storage import (
@@ -17,6 +19,8 @@ from deltacat.compute.compactor_v2.constants import (
      PARQUET_TO_PYARROW_INFLATION,
  )
  
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+ 
  
  def _get_parquet_type_params_if_exist(
      entry: ManifestEntry,
@@ -133,11 +137,15 @@ def hash_bucket_resource_options_provider(
      ray_custom_resources: Optional[Dict] = None,
      **kwargs,
  ) -> Dict:
+     debug_memory_params = {"hash_bucket_task_index": index}
      size_bytes = 0.0
      num_rows = 0
      total_pk_size = 0
  
      if not item.manifest or not item.manifest.entries:
+         logger.debug(
+             f"[Hash bucket task {index}]: No manifest entries, skipping memory allocation calculation"
+         )
          return {"CPU": 0.01}
  
      for entry in item.manifest.entries:
@@ -162,12 +170,30 @@ def hash_bucket_resource_options_provider(
          else:
              total_pk_size += pk_size
  
-     # total size + pk size + pk hash column + hash bucket index column
+     # total size + pk size + pyarrow-to-numpy conversion + pk hash column + hashlib inefficiency + hash bucket index column
      # Refer to hash_bucket step for more details.
-     total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
+     total_memory = (
+         size_bytes
+         + total_pk_size
+         + total_pk_size
+         + num_rows * 20
+         + num_rows * 20
+         + num_rows * 4
+     )
+     debug_memory_params["size_bytes"] = size_bytes
+     debug_memory_params["num_rows"] = num_rows
+     debug_memory_params["total_pk_size"] = total_pk_size
+     debug_memory_params["total_memory"] = total_memory
+ 
+     debug_memory_params["previous_inflation"] = previous_inflation
+     debug_memory_params["average_record_size_bytes"] = average_record_size_bytes
  
      # Consider buffer
      total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+     debug_memory_params["total_memory_with_buffer"] = total_memory
+     logger.debug(
+         f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
+     )
  
      return get_task_options(0.01, total_memory, ray_custom_resources)
  
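The rewritten estimate doubles the primary-key and hash-column terms to cover the pyarrow-to-numpy conversion and hashlib working space called out in the new comment. A back-of-the-envelope check with hypothetical inputs; the real buffer value comes from deltacat.compute.compactor_v2.constants.TOTAL_MEMORY_BUFFER_PERCENTAGE, and 30% is only assumed here:

# Illustrative numbers, not measurements from the package.
size_bytes = 2 * 1024**3       # ~2 GiB of decompressed table data
total_pk_size = 200 * 1024**2  # ~200 MiB of primary key values
num_rows = 10_000_000

total_memory = (
    size_bytes
    + total_pk_size   # primary key columns
    + total_pk_size   # pyarrow-to-numpy conversion copy
    + num_rows * 20   # pk hash column (20 bytes per row, per the formula above)
    + num_rows * 20   # hashlib working-space inefficiency
    + num_rows * 4    # int32 hash bucket index column
)

TOTAL_MEMORY_BUFFER_PERCENTAGE = 30  # assumed for this example
total_memory *= 1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0
print(f"{total_memory / 1024**3:.2f} GiB requested for the hash bucket task")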
@@ -186,10 +212,13 @@ def merge_resource_options_provider(
      deltacat_storage_kwargs: Optional[Dict] = {},
      **kwargs,
  ) -> Dict:
+     debug_memory_params = {"merge_task_index": index}
      hb_group_idx = item[0]
  
      data_size = hash_group_size_bytes.get(hb_group_idx, 0)
      num_rows = hash_group_num_rows.get(hb_group_idx, 0)
+     debug_memory_params["data_size_from_hash_group"] = data_size
+     debug_memory_params["num_rows_from_hash_group"] = num_rows
  
      # upper bound for pk size of incremental
      pk_size_bytes = data_size
@@ -205,10 +234,13 @@ def merge_resource_options_provider(
              round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
              / round_completion_info.compacted_pyarrow_write_result.file_bytes
          )
+         debug_memory_params["previous_inflation"] = previous_inflation
+ 
          average_record_size = (
              round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
              / round_completion_info.compacted_pyarrow_write_result.records
          )
+         debug_memory_params["average_record_size"] = average_record_size
  
          iterable = hash_group_index_to_hash_bucket_indices(
              hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
@@ -247,16 +279,27 @@ def merge_resource_options_provider(
          else:
              pk_size_bytes += pk_size
  
-     # total data downloaded + primary key hash column + primary key column
-     # + dict size for merge + incremental index array size
+     # total data downloaded + primary key hash column + pyarrow-to-numpy conversion
+     # + primary key column + hashlib inefficiency + dict size for merge + incremental index array size
      total_memory = (
          data_size
          + pk_size_bytes
+         + pk_size_bytes
+         + num_rows * 20
          + num_rows * 20
          + num_rows * 20
          + incremental_index_array_size
      )
+     debug_memory_params["data_size"] = data_size
+     debug_memory_params["num_rows"] = num_rows
+     debug_memory_params["pk_size_bytes"] = pk_size_bytes
+     debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
+     debug_memory_params["total_memory"] = total_memory
  
      total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+     debug_memory_params["total_memory_with_buffer"] = total_memory
+     logger.debug(
+         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
+     )
  
      return get_task_options(0.01, total_memory, ray_custom_resources)
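The merge estimate follows the same pattern but carries a third num_rows * 20 term for the dict built during the merge, plus the incremental index array. A compact worked example with placeholder sizes and the same assumed 30% buffer (all values hypothetical):

data_size = 4 * 1024**3        # data routed to this hash bucket group (placeholder)
pk_size_bytes = 300 * 1024**2  # upper bound on primary key bytes (placeholder)
num_rows = 20_000_000
incremental_index_array_size = num_rows * 4  # placeholder value for the example

total_memory = (
    data_size
    + pk_size_bytes   # primary key column
    + pk_size_bytes   # pyarrow-to-numpy conversion copy
    + num_rows * 20   # primary key hash column
    + num_rows * 20   # hashlib working-space inefficiency
    + num_rows * 20   # dict used during the merge (extra term vs. hash bucketing)
    + incremental_index_array_size
)
total_memory *= 1.30  # assumed 30% TOTAL_MEMORY_BUFFER_PERCENTAGE
print(f"{total_memory / 1024**3:.2f} GiB requested for the merge task")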
@@ -0,0 +1,4 @@
+ from deltacat.types.media import DistributedDatasetType
+ from deltacat.compute.merge_on_read.daft import merge as daft_merge
+ 
+ MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE = {DistributedDatasetType.DAFT.value: daft_merge}
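This registry (which appears to be deltacat/compute/merge_on_read/__init__.py, given the +4 count in the file list) maps a distributed dataset type to its merge-on-read implementation. A hypothetical lookup sketch, using only names defined in this diff:

# Hypothetical dispatch sketch, not deltacat code.
from deltacat.types.media import DistributedDatasetType
from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE

dataset_type = DistributedDatasetType.DAFT
merge_func = MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE.get(dataset_type.value)
if merge_func is None:
    raise ValueError(f"No merge-on-read implementation for {dataset_type}")
# merge_func(params) would then be called with MergeOnReadParams (defined below).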
@@ -0,0 +1,40 @@
+ import logging
+ from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
+ from deltacat.storage.model.types import DistributedDataset
+ from deltacat.types.media import TableType, DistributedDatasetType
+ from deltacat.compute.merge_on_read.utils.delta import create_df_from_all_deltas
+ from deltacat import logs
+ 
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+ 
+ 
+ def merge(params: MergeOnReadParams, **kwargs) -> DistributedDataset:
+     """
+     Merges the given deltas and returns the result as a distributed dataframe.
+     It reads the deltas into Daft dataframes and leverages operations supported
+     by Daft to perform an efficient merge on a Ray cluster.
+ 
+     TODO(raghumdani): Perform actual merge.
+     """
+ 
+     delta_dfs = create_df_from_all_deltas(
+         deltas=params.deltas,
+         table_type=TableType.PYARROW,
+         distributed_dataset_type=DistributedDatasetType.DAFT,
+         reader_kwargs=params.reader_kwargs,
+         deltacat_storage=params.deltacat_storage,
+         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+         **kwargs,
+     )
+ 
+     logger.info(f"Merging {len(delta_dfs)} delta dfs...")
+ 
+     # TODO: This code should be optimized from the daft side
+     result = None
+     for df in delta_dfs:
+         if result is None:
+             result = df
+         else:
+             result = result.concat(df)
+ 
+     return result
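The concat loop above is a straightforward left-fold over the per-delta dataframes. An equivalent formulation with functools.reduce, shown only to clarify what the loop computes; it assumes delta_dfs is non-empty and relies on the same DataFrame.concat call used in the hunk:

# Not deltacat code: equivalent unioning of the per-delta dataframes.
import functools

def union_all(delta_dfs):
    # Fold left-to-right with Daft's DataFrame.concat, as the loop above does.
    return functools.reduce(lambda left, right: left.concat(right), delta_dfs)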
File without changes
@@ -0,0 +1,66 @@
+ from __future__ import annotations
+ from typing import Optional, Dict, List, Union, Any
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     interface as unimplemented_deltacat_storage,
+ )
+ 
+ 
+ class MergeOnReadParams(dict):
+     """
+     This class represents the parameters passed to merge-on-read (deltacat/compute/merge_on_read/daft.py)
+     """
+ 
+     @staticmethod
+     def of(params: Optional[Dict]) -> MergeOnReadParams:
+         params = {} if params is None else params
+ 
+         result = MergeOnReadParams(params)
+         assert result.deltas is not None, "deltas is a required arg"
+ 
+         result.deltacat_storage = params.get(
+             "deltacat_storage", unimplemented_deltacat_storage
+         )
+         result.reader_kwargs = params.get("reader_kwargs", {})
+         result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
+ 
+         return result
+ 
+     @property
+     def deltas(self) -> List[Union[Delta, DeltaLocator]]:
+         """
+         The list of deltas to merge in memory.
+         """
+         return self["deltas"]
+ 
+     @deltas.setter
+     def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
+         self["deltas"] = to_set
+ 
+     @property
+     def reader_kwargs(self) -> Dict[Any, Any]:
+         """
+         The keyword arguments to be passed to the reader.
+         """
+         return self["reader_kwargs"]
+ 
+     @reader_kwargs.setter
+     def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
+         self["reader_kwargs"] = kwargs
+ 
+     @property
+     def deltacat_storage(self) -> unimplemented_deltacat_storage:
+         return self["deltacat_storage"]
+ 
+     @deltacat_storage.setter
+     def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
+         self["deltacat_storage"] = storage
+ 
+     @property
+     def deltacat_storage_kwargs(self) -> dict:
+         return self["deltacat_storage_kwargs"]
+ 
+     @deltacat_storage_kwargs.setter
+     def deltacat_storage_kwargs(self, kwargs: dict) -> None:
+         self["deltacat_storage_kwargs"] = kwargs
File without changes
@@ -0,0 +1,42 @@
+ from typing import List, Dict, Any, Optional, Union
+ from deltacat.storage.model.delta import Delta, DeltaLocator
+ from deltacat.storage.model.types import DistributedDataset
+ from deltacat.storage import (
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.types.media import TableType, StorageType, DistributedDatasetType
+ 
+ 
+ def create_df_from_all_deltas(
+     deltas: List[Union[Delta, DeltaLocator]],
+     table_type: TableType,
+     distributed_dataset_type: DistributedDatasetType,
+     reader_kwargs: Optional[Dict[Any, Any]] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
+     *args,
+     **kwargs
+ ) -> List[DistributedDataset]:  # type: ignore
+     """
+     This method creates a distributed dataset for each delta and returns their references.
+     """
+ 
+     if reader_kwargs is None:
+         reader_kwargs = {}
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
+ 
+     df_list = []
+ 
+     for delta in deltas:
+         df = deltacat_storage.download_delta(
+             delta_like=delta,
+             table_type=table_type,
+             distributed_dataset_type=distributed_dataset_type,
+             storage_type=StorageType.DISTRIBUTED,
+             **reader_kwargs,
+             **deltacat_storage_kwargs
+         )
+         df_list.append(df)
+ 
+     return df_list
@@ -23,7 +23,12 @@ from deltacat.storage import (
      SortKey,
      PartitionLocator,
  )
- from deltacat.types.media import ContentType, StorageType, TableType
+ from deltacat.types.media import (
+     ContentType,
+     StorageType,
+     TableType,
+     DistributedDatasetType,
+ )
  from deltacat.utils.common import ReadKwargsProvider
  
  
@@ -178,9 +183,10 @@ def download_delta(
      columns: Optional[List[str]] = None,
      file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
      ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
      *args,
      **kwargs
- ) -> Union[LocalDataset, DistributedDataset]:
+ ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
      """
      Download the given delta or delta locator into either a list of
      tables resident in the local node's memory, or into a dataset distributed
@@ -205,6 +211,8 @@ def download_delta_manifest_entry(
      given delta or delta locator. If a delta is provided with a non-empty
      manifest, then the entry is downloaded from this manifest. Otherwise, the
      manifest is first retrieved then the given entry index downloaded.
+ 
+     NOTE: The entry will be downloaded in the current node's memory.
      """
      raise NotImplementedError("download_delta_manifest_entry not implemented")
  
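With the new distributed_dataset_type argument, a caller can request a Daft dataframe instead of the default Ray dataset. An illustrative wrapper, assuming a concrete storage implementation and a delta obtained from it; the argument names follow the create_df_from_all_deltas call shown earlier in this diff:

# Hypothetical helper, not part of the storage interface.
from deltacat.types.media import TableType, StorageType, DistributedDatasetType

def read_delta_as_daft(deltacat_storage, delta):
    """Sketch: download one delta as a Daft dataframe via the extended interface."""
    return deltacat_storage.download_delta(
        delta_like=delta,
        table_type=TableType.PYARROW,
        storage_type=StorageType.DISTRIBUTED,
        # New in 1.0.0: choose the distributed frontend; the default stays RAY_DATASET.
        distributed_dataset_type=DistributedDatasetType.DAFT,
    )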
@@ -1,24 +1,16 @@
  from enum import Enum
- from typing import List, Union, Any
+ from typing import List, Union
  
  from pyarrow.parquet import ParquetFile
  import numpy as np
  import pandas as pd
  import pyarrow as pa
- import pkg_resources
- from ray.data._internal.arrow_block import ArrowRow
  from ray.data.dataset import Dataset
+ from daft import DataFrame as DaftDataFrame
  
  LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
  LocalDataset = List[LocalTable]
- # Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
- # and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
- ray_version = pkg_resources.parse_version(pkg_resources.get_distribution("ray").version)
- change_version = pkg_resources.parse_version("2.5.0")
- if ray_version < change_version:
-     DistributedDataset = Dataset[Union[ArrowRow, np.ndarray, Any]]
- else:
-     DistributedDataset = Dataset
+ DistributedDataset = Union[Dataset, DaftDataFrame]
  
  
  class DeltaType(str, Enum):
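Since DistributedDataset is now a Union of Ray's Dataset and Daft's DataFrame, downstream helpers can branch on the concrete type. The row-count helper below is an illustrative sketch, not deltacat API:

# Not deltacat code: shows one way to handle the widened DistributedDataset union.
from daft import DataFrame as DaftDataFrame
from ray.data.dataset import Dataset

def count_rows(dataset) -> int:
    if isinstance(dataset, DaftDataFrame):
        return dataset.count_rows()  # Daft: same call used by the catalog tests below
    if isinstance(dataset, Dataset):
        return dataset.count()       # Ray Data: Dataset.count()
    raise TypeError(f"Unsupported distributed dataset type: {type(dataset)}")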
File without changes
@@ -0,0 +1,98 @@
+ import unittest
+ import sqlite3
+ import ray
+ import os
+ import deltacat.tests.local_deltacat_storage as ds
+ from deltacat.utils.common import current_time_ms
+ from deltacat.tests.test_utils.pyarrow import (
+     create_delta_from_csv_file,
+     commit_delta_to_partition,
+ )
+ from deltacat.types.media import DistributedDatasetType, ContentType
+ from deltacat.catalog import default_catalog_impl as dc
+ 
+ 
+ class TestReadTable(unittest.TestCase):
+     READ_TABLE_NAMESPACE = "catalog_read_table_namespace"
+     LOCAL_CATALOG_NAME = "local_catalog"
+     DB_FILE_PATH = f"{current_time_ms()}.db"
+     SAMPLE_FILE_PATH = "deltacat/tests/catalog/data/sample_table.csv"
+ 
+     @classmethod
+     def setUpClass(cls):
+         ray.init(local_mode=True, ignore_reinit_error=True)
+ 
+         con = sqlite3.connect(cls.DB_FILE_PATH)
+         cur = con.cursor()
+         cls.kwargs = {
+             ds.SQLITE_CON_ARG: con,
+             ds.SQLITE_CUR_ARG: cur,
+             "supported_content_types": [ContentType.CSV],
+         }
+         cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
+ 
+         super().setUpClass()
+ 
+     @classmethod
+     def doClassCleanups(cls) -> None:
+         os.remove(cls.DB_FILE_PATH)
+ 
+     def test_daft_distributed_read_sanity(self):
+         # setup
+         READ_TABLE_TABLE_NAME = "test_read_table"
+         create_delta_from_csv_file(
+             self.READ_TABLE_NAMESPACE,
+             [self.SAMPLE_FILE_PATH],
+             table_name=READ_TABLE_TABLE_NAME,
+             **self.kwargs,
+         )
+ 
+         dc.initialize(ds=ds)
+         df = dc.read_table(
+             table=READ_TABLE_TABLE_NAME,
+             namespace=self.READ_TABLE_NAMESPACE,
+             catalog=self.LOCAL_CATALOG_NAME,
+             distributed_dataset_type=DistributedDatasetType.DAFT,
+             deltacat_storage_kwargs=self.kwargs,
+         )
+ 
+         # verify
+         self.assertEqual(df.count_rows(), 6)
+         self.assertEqual(df.column_names, ["pk", "value"])
+ 
+     def test_daft_distributed_read_multiple_deltas(self):
+         # setup
+         READ_TABLE_TABLE_NAME = "test_read_table_2"
+         delta = create_delta_from_csv_file(
+             self.READ_TABLE_NAMESPACE,
+             [self.SAMPLE_FILE_PATH],
+             table_name=READ_TABLE_TABLE_NAME,
+             **self.kwargs,
+         )
+ 
+         partition = ds.get_partition(
+             delta.stream_locator, delta.partition_values, **self.kwargs
+         )
+ 
+         commit_delta_to_partition(
+             partition=partition, file_paths=[self.SAMPLE_FILE_PATH], **self.kwargs
+         )
+ 
+         # action
+         dc.initialize(ds=ds)
+         df = dc.read_table(
+             table=READ_TABLE_TABLE_NAME,
+             namespace=self.READ_TABLE_NAMESPACE,
+             catalog=self.LOCAL_CATALOG_NAME,
+             distributed_dataset_type=DistributedDatasetType.DAFT,
+             merge_on_read=False,
+             deltacat_storage_kwargs=self.kwargs,
+         )
+ 
+         # verify
+         self.assertEqual(
+             df.count_rows(),
+             12,
+             "we expect twice as many rows as merge on read is disabled",
+         )
+         self.assertEqual(df.column_names, ["pk", "value"])