deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,207 @@
1
+ import logging
2
+
3
+ from collections import OrderedDict
4
+ from typing import Dict, Any, Optional, List, Iterable
5
+
6
+ from ray.data import Datasink
7
+ from ray.data._internal.execution.interfaces import TaskContext
8
+ from ray.data.block import Block, BlockAccessor
9
+ from ray.data.datasource import WriteResult
10
+
11
+ from ray.data.datasource.filename_provider import (
12
+ FilenameProvider,
13
+ )
14
+
15
+ from deltacat import logs
16
+
17
+ from deltacat.constants import METAFILE_FORMAT_MSGPACK
18
+ from deltacat.storage import Metafile
19
+ from deltacat.io.datasource.deltacat_datasource import (
20
+ METAFILE_DATA_COLUMN_NAME,
21
+ METAFILE_TYPE_COLUMN_NAME,
22
+ )
23
+ from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlWriter
24
+
25
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
+
27
+
28
class CapturingBlockWritePathProvider(FilenameProvider):
    """Delegating block write path provider that saves an ordered dictionary of
    input keyword arguments for every block write path returned.

    Filenames are produced by the wrapped ``block_write_path_provider``; this
    class prefixes them with ``base_path`` and records, per resulting write
    path, the keyword arguments that were used to produce it.
    """

    def __init__(
        self,
        block_write_path_provider: FilenameProvider,
        base_path: Optional[str] = None,
    ):
        """
        Args:
            block_write_path_provider: Provider that filename generation is
                delegated to.
            base_path: Prefix joined to every generated filename with "/".
                May be left as None at construction time, but must be set
                before `get_filename_for_block` is called.
        """
        self.base_path = base_path
        self.block_write_path_provider = block_write_path_provider
        # Maps write path -> keyword arguments used to generate that path,
        # in the order the paths were generated.
        self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

    def get_filename_for_block(
        self,
        block: Any,
        task_index: int,
        block_index: int,
    ) -> str:
        """Return the full write path for a block and record its inputs.

        Args:
            block: Block being written; forwarded to the wrapped provider.
            task_index: Index of the write task; forwarded to the wrapped
                provider.
            block_index: Index of the block within the write task; forwarded
                to the wrapped provider.

        Returns:
            ``f"{base_path}/{filename}"`` where ``filename`` comes from the
            wrapped provider.

        Raises:
            ValueError: If ``base_path`` is None.
        """
        if self.base_path is None:
            raise ValueError(
                "Base path must be provided to CapturingBlockWritePathProvider",
            )
        # task_index must be forwarded: the wrapped provider's
        # get_filename_for_block() takes it as a required argument, so
        # omitting it makes the delegated call fail with a TypeError.
        return self._get_write_path_for_block(
            base_path=self.base_path,
            block=block,
            task_index=task_index,
            block_index=block_index,
        )

    def _get_write_path_for_block(
        self,
        base_path: str,
        *args,
        **kwargs,
    ) -> str:
        """Delegate filename generation, then capture the keyword arguments.

        Only keyword arguments (plus ``base_path``) are recorded in
        ``write_path_kwargs``; positional ``args`` are forwarded to the
        wrapped provider but not captured.
        """
        filename = self.block_write_path_provider.get_filename_for_block(
            *args,
            **kwargs,
        )
        write_path = f"{base_path}/{filename}"
        # kwargs is this call's own dict, so adding to it does not affect
        # the caller.
        kwargs["base_path"] = base_path
        self.write_path_kwargs[write_path] = kwargs
        return write_path
71
+
72
+
73
class DeltaCatWriteResult:
    """Mutable record describing the outcome of a dataset write.

    Every field is initialized to None; callers are expected to assign the
    fields they need after construction.
    """

    def __init__(self):
        # NOTE(review): field types below are not established by any live
        # code in this module, so they are left as Optional[Any] - confirm
        # against callers before narrowing.
        self.metadata: Optional[Any] = None
        self.path: Optional[Any] = None
        self.dataset_uuid: Optional[Any] = None
        self.block_write_path_provider: Optional[Any] = None
        self.content_type: Optional[Any] = None
        self.content_encoding: Optional[Any] = None
        self.filesystem: Optional[Any] = None
82
+
83
+
84
class DeltaCatDatasink(Datasink[List[Metafile]]):
    """Ray Data sink that writes serialized DeltaCAT metafiles to a
    DeltaCAT URL.

    Input blocks must contain both the ``METAFILE_DATA_COLUMN_NAME`` and
    ``METAFILE_TYPE_COLUMN_NAME`` columns. Each value of the data column is
    deserialized from msgpack into a `Metafile` and written through a
    `DeltaCatUrlWriter` built from the sink's URL.
    """

    def __init__(
        self,
        url: DeltaCatUrl,
        *,
        metadata_only: bool = False,
        copy_on_write: Optional[bool] = False,
    ):
        """
        Args:
            url: Destination DeltaCAT URL that metafiles are written to.
            metadata_only: Stored on the sink; not read by any method of
                this class.
            copy_on_write: Stored on the sink; not read by any method of
                this class.
        """
        self._url = url
        self._metadata_only = metadata_only
        self._copy_on_write = copy_on_write

    def on_write_start(self) -> None:
        """No-op; nothing is prepared before the write starts."""
        pass

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> List[Metafile]:
        """Deserialize and write every metafile found in the given blocks.

        Args:
            blocks: Blocks to write. Each must contain the metafile data
                and metafile type columns.
            ctx: Ray task context (unused).

        Returns:
            The metafiles written by this task, in the order they were
            written.

        Raises:
            NotImplementedError: If a block is missing either of the
                required metafile columns.
        """
        written_metafiles: List[Metafile] = []
        for block in blocks:
            pa_table = BlockAccessor.for_block(block).to_arrow()
            column_names = pa_table.column_names
            if (
                METAFILE_DATA_COLUMN_NAME not in column_names
                or METAFILE_TYPE_COLUMN_NAME not in column_names
            ):
                raise NotImplementedError(
                    f"Expected {METAFILE_DATA_COLUMN_NAME} and "
                    f"{METAFILE_TYPE_COLUMN_NAME} columns in the input block, "
                    f"but found {pa_table.column_names}."
                )
            for pa_scalar in pa_table[METAFILE_DATA_COLUMN_NAME]:
                metafile_msgpack_bytes = pa_scalar.as_py()
                metafile = Metafile.deserialize(
                    serialized=metafile_msgpack_bytes,
                    meta_format=METAFILE_FORMAT_MSGPACK,
                )
                # TODO(pdames): Add `metafile` to writer as a kwarg instead
                #  of constructing a new URL with the metafile as input.
                writer_url = DeltaCatUrlWriter(self._url, metafile=metafile)
                # TODO(pdames): Run writes in order from catalog -> delta
                #  by truncating the URL down to just dc://{catalog-name}
                #  and rebuilding all path elements from there.
                writer_url.write(metafile)
                written_metafiles.append(metafile)
        # Return what was written so the declared List[Metafile] return type
        # holds and the result reaches on_write_complete(); previously this
        # method implicitly returned None.
        return written_metafiles

    def on_write_complete(
        self,
        write_result: WriteResult[List[Metafile]],
    ):
        """No-op; the aggregated write result is currently not used."""
        pass
135
+
136
+
137
+ """
138
+ def write(
139
+ self,
140
+ blocks: Iterable[Block],
141
+ ctx: TaskContext,
142
+ ) -> List[ObjectRef[DeltacatWriteResult]]:
143
+ paths, filesystem = resolve_paths_and_filesystem(
144
+ self.path,
145
+ self.filesystem,
146
+ )
147
+ assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
148
+ path = paths[0]
149
+ write_results = super().write(blocks)
150
+ # append a summary of this write operation in the last write result
151
+ metadata = [BlockAccessor.for_block(_).get_metadata() for _ in blocks]
152
+ rwr = DeltacatWriteResult()
153
+ rwr.metadata = metadata
154
+ rwr.path = path
155
+ rwr.dataset_uuid = self.dataset_uuid
156
+ rwr.block_write_path_provider = self.filename_provider
157
+ rwr.content_type = ContentType.PARQUET.value
158
+ rwr.content_encoding = ContentEncoding.IDENTITY.value
159
+ rwr.filesystem = filesystem
160
+ rwr_obj_ref = ray.put(rwr)
161
+ write_results.append(rwr_obj_ref)
162
+ return write_results
163
+
164
+ def on_write_complete(self, write_results: List[Any], **kwargs) -> None:
165
+ # TODO (pdames): time latency of this operation - overall s3 write times
166
+ # are 2-3x pure read_parquet_fast() times
167
+ # restore the write operation summary from the last write result
168
+ result: DeltacatWriteResult = write_results[len(write_results) - 1]
169
+ write_path_args = result.block_write_path_provider.write_path_kwargs
170
+ blocks_written = len(write_path_args)
171
+ expected_blocks_written = len(result.metadata)
172
+ # TODO(pdames): Corner cases where mismatch is expected? Empty blocks?
173
+ # Blocks filtered/split/merged to more/less write paths?
174
+ assert blocks_written == expected_blocks_written, (
175
+ f"Dataset write result validation failed. Found "
176
+ f"{blocks_written}/{expected_blocks_written} Dataset blocks "
177
+ f"written. Refusing to commit DeltaCAT Manifest."
178
+ )
179
+ manifest_entries = ManifestEntryList()
180
+ for block_idx, path in enumerate(write_path_args.keys()):
181
+ file_info = result.filesystem.get_file_info(path)
182
+ if file_info.type == pyarrow.fs.FileType.File:
183
+ content_length = file_info.size
184
+ else:
185
+ raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
186
+ num_rows = result.metadata[block_idx].num_rows
187
+ source_content_length = result.metadata[block_idx].size_bytes
188
+ manifest_entry_meta = ManifestMeta.of(
189
+ int(num_rows) if num_rows is not None else None,
190
+ int(content_length) if content_length is not None else None,
191
+ result.content_type,
192
+ result.content_encoding,
193
+ int(source_content_length) if source_content_length else None,
194
+ )
195
+ parsed_url = parse_s3_url(path)
196
+ manifest_entry = ManifestEntry.of(
197
+ parsed_url.url,
198
+ manifest_entry_meta,
199
+ )
200
+ manifest_entries.append(manifest_entry)
201
+ manifest = Manifest.of(manifest_entries)
202
+ manifest_path = f"{result.path}/manifest"
203
+ logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
204
+ with result.filesystem.open_output_stream(manifest_path) as f:
205
+ f.write(json.dumps(manifest).encode("utf-8"))
206
+ logger.debug(f"Manifest committed to: {manifest_path}")
207
+ """
File without changes