deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/utils/filesystem.py ADDED
@@ -0,0 +1,320 @@
+from __future__ import annotations
+
+import re
+from typing import Optional, Tuple, Union, List
+
+import sys
+import urllib
+import pathlib
+
+import pyarrow
+import pyarrow as pa
+from pyarrow.fs import (
+    _resolve_filesystem_and_path,
+    FileSelector,
+    FileInfo,
+    FileType,
+    FileSystem,
+    FSSpecHandler,
+    PyFileSystem,
+)
+
+_LOCAL_SCHEME = "local"
+
+
+def resolve_paths_and_filesystem(
+    paths: Union[str, List[str]],
+    filesystem: pyarrow.fs.FileSystem = None,
+) -> Tuple[List[str], pyarrow.fs.FileSystem]:
+    """
+    Resolves and normalizes all provided paths, infers a filesystem from the
+    paths or validates the provided filesystem against the paths, and ensures
+    that all paths use the same filesystem.
+
+    Args:
+        paths: A single file/directory path or a list of file/directory paths.
+            A list of paths can contain both files and directories.
+        filesystem: The filesystem implementation that should be used for
+            reading these files. If None, a filesystem will be inferred. If not
+            None, the provided filesystem will still be validated against all
+            filesystems inferred from the provided paths to ensure
+            compatibility.
+    """
+    if isinstance(paths, str):
+        paths = [paths]
+    if isinstance(paths, pathlib.Path):
+        paths = [str(paths)]
+    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
+        raise ValueError(
+            "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got "
+            f"`{paths}`."
+        )
+    elif len(paths) == 0:
+        raise ValueError("Must provide at least one path.")
+
+    need_unwrap_path_protocol = True
+    if filesystem and not isinstance(filesystem, FileSystem):
+        err_msg = (
+            f"The filesystem passed must either conform to "
+            f"pyarrow.fs.FileSystem, or "
+            f"fsspec.spec.AbstractFileSystem. The provided "
+            f"filesystem was: {filesystem}"
+        )
+        try:
+            import fsspec
+            from fsspec.implementations.http import HTTPFileSystem
+        except ModuleNotFoundError:
+            # If filesystem is not a pyarrow filesystem and fsspec isn't
+            # installed, then filesystem is neither a pyarrow filesystem nor
+            # an fsspec filesystem, so we raise a TypeError.
+            raise TypeError(err_msg) from None
+        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
+            raise TypeError(err_msg) from None
+        if isinstance(filesystem, HTTPFileSystem):
+            # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths
+            # should not be unwrapped/removed, because HTTPFileSystem expects full file
+            # paths including protocol/scheme. This is different behavior compared to
+            # file systems implementation in pyarrow.fs.FileSystem.
+            need_unwrap_path_protocol = False
+
+        filesystem = PyFileSystem(FSSpecHandler(filesystem))
+
+    resolved_paths = []
+    for path in paths:
+        path = _resolve_custom_scheme(path)
+        try:
+            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
+                path, filesystem
+            )
+        except pa.lib.ArrowInvalid as e:
+            if "Cannot parse URI" in str(e):
+                resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
+                    _encode_url(path), filesystem
+                )
+                resolved_path = _decode_url(resolved_path)
+            elif "Unrecognized filesystem type in URI" in str(e):
+                scheme = urllib.parse.urlparse(path, allow_fragments=False).scheme
+                if scheme in ["http", "https"]:
+                    # If scheme of path is HTTP and filesystem is not resolved,
+                    # try to use fsspec HTTPFileSystem. This expects fsspec is
+                    # installed.
+                    try:
+                        from fsspec.implementations.http import HTTPFileSystem
+                    except ModuleNotFoundError:
+                        raise ImportError(
+                            "Please install fsspec to read files from HTTP."
+                        ) from None
+
+                    resolved_filesystem = PyFileSystem(FSSpecHandler(HTTPFileSystem()))
+                    resolved_path = path
+                    need_unwrap_path_protocol = False
+                else:
+                    raise
+            else:
+                raise
+        if filesystem is None:
+            filesystem = resolved_filesystem
+        elif need_unwrap_path_protocol:
+            resolved_path = _unwrap_protocol(resolved_path)
+        resolved_path = filesystem.normalize_path(resolved_path)
+        resolved_paths.append(resolved_path)
+
+    return resolved_paths, filesystem
+
+
+def resolve_path_and_filesystem(
+    path: str,
+    filesystem: Optional[pyarrow.fs.FileSystem] = None,
+) -> Tuple[str, pyarrow.fs.FileSystem]:
+    """
+    Resolves and normalizes the provided path, infers a filesystem from the
+    path, or validates the provided filesystem against the path.
+
+    Args:
+        path: A single file/directory path.
+        filesystem: The filesystem implementation that should be used for
+            reading these files. If None, a filesystem will be inferred. If not
+            None, the provided filesystem will still be validated against all
+            filesystems inferred from the provided paths to ensure
+            compatibility.
+    """
+    paths, filesystem = resolve_paths_and_filesystem(
+        paths=path,
+        filesystem=filesystem,
+    )
+    assert len(paths) == 1, len(paths)
+    return paths[0], filesystem
+
+
+def list_directory(
+    path: str,
+    filesystem: pyarrow.fs.FileSystem,
+    exclude_prefixes: Optional[List[str]] = None,
+    ignore_missing_path: bool = False,
+    recursive: bool = False,
+) -> List[Tuple[str, int]]:
+    """
+    Expand the provided directory path to a list of file paths.
+
+    Args:
+        path: The directory path to expand.
+        filesystem: The filesystem implementation that should be used for
+            reading these files.
+        exclude_prefixes: The file relative path prefixes that should be
+            excluded from the returned file set. Default excluded prefixes are
+            "." and "_".
+        recursive: Whether to expand subdirectories or not.
+
+    Returns:
+        A list of (file_path, file_size) tuples.
+    """
+    if exclude_prefixes is None:
+        exclude_prefixes = [".", "_"]
+
+    selector = FileSelector(
+        base_dir=path,
+        recursive=recursive,
+        allow_not_found=ignore_missing_path,
+    )
+    try:
+        files = filesystem.get_file_info(selector)
+    except OSError as e:
+        if isinstance(e, FileNotFoundError):
+            files = []
+        else:
+            _handle_read_os_error(e, path)
+    base_path = selector.base_dir
+    out = []
+    for file_ in files:
+        file_path = file_.path
+        if not file_path.startswith(base_path):
+            continue
+        relative = file_path[len(base_path) :]
+        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
+            continue
+        out.append((file_path, file_.size))
+    # We sort the paths to guarantee a stable order.
+    return sorted(out)
+
+
+def get_file_info(
+    path: str,
+    filesystem: pyarrow.fs.FileSystem,
+    ignore_missing_path: bool = False,
+) -> FileInfo:
+    """Get the file info for the provided path."""
+    try:
+        file_info = filesystem.get_file_info(path)
+    except OSError as e:
+        _handle_read_os_error(e, path)
+    if file_info.type == FileType.NotFound and not ignore_missing_path:
+        raise FileNotFoundError(path)
+
+    return file_info
+
+
+def _handle_read_os_error(
+    error: OSError,
+    paths: Union[str, List[str]],
+) -> str:
+    # NOTE: this is not comprehensive yet, and should be extended as more errors arise.
+    # NOTE: The latter patterns are raised in Arrow 10+, while the former is raised in
+    # Arrow < 10.
+    aws_error_pattern = (
+        r"^(?:(.*)AWS Error \[code \d+\]: No response body\.(.*))|"
+        r"(?:(.*)AWS Error UNKNOWN \(HTTP status 400\) during HeadObject operation: "
+        r"No response body\.(.*))|"
+        r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
+        r"body\.(.*))$"
+    )
+    if re.match(aws_error_pattern, str(error)):
+        # Specially handle AWS error when reading files, to give a clearer error
+        # message to avoid confusing users. The real issue is most likely that the AWS
+        # S3 file credentials have not been properly configured yet.
+        if isinstance(paths, str):
+            # Quote to highlight single file path in error message for better
+            # readability. List of file paths will be shown up as ['foo', 'boo'],
+            # so only quote single file path here.
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read AWS S3 file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "You can also run AWS CLI command to get more detailed error message "
+                "(e.g., aws s3 ls <file-name>). "
+                "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "for more information."
+            )
+        )
+    else:
+        raise error
+
+
+def _is_local_windows_path(path: str) -> bool:
+    """Determines if path is a Windows file-system location."""
+    if sys.platform != "win32":
+        return False
+
+    if len(path) >= 1 and path[0] == "\\":
+        return True
+    if (
+        len(path) >= 3
+        and path[1] == ":"
+        and (path[2] == "/" or path[2] == "\\")
+        and path[0].isalpha()
+    ):
+        return True
+    return False
+
+
+def _unwrap_protocol(path):
+    """
+    Slice off any protocol prefixes on path.
+    """
+    if sys.platform == "win32" and _is_local_windows_path(path):
+        # Represent as posix path such that downstream functions properly handle it.
+        # This is executed when 'file://' is NOT included in the path.
+        return pathlib.Path(path).as_posix()
+
+    parsed = urllib.parse.urlparse(path, allow_fragments=False)  # support '#' in path
+    query = "?" + parsed.query if parsed.query else ""  # support '?' in path
+    netloc = parsed.netloc
+    if parsed.scheme == "s3" and "@" in parsed.netloc:
+        # If the path contains an @, it is assumed to be an anonymous
+        # credentialed path, and we need to strip off the credentials.
+        netloc = parsed.netloc.split("@")[-1]
+
+    parsed_path = parsed.path
+    # urlparse prepends the path with a '/'. This does not work on Windows
+    # so if this is the case strip the leading slash.
+    if (
+        sys.platform == "win32"
+        and not netloc
+        and len(parsed_path) >= 3
+        and parsed_path[0] == "/"  # The problematic leading slash
+        and parsed_path[1].isalpha()  # Ensure it is a drive letter.
+        and parsed_path[2:4] in (":", ":/")
+    ):
+        parsed_path = parsed_path[1:]
+
+    return netloc + parsed_path + query
+
+
+def _encode_url(path):
+    return urllib.parse.quote(path, safe="/:")
+
+
+def _decode_url(path):
+    return urllib.parse.unquote(path)
+
+
+def _resolve_custom_scheme(path: str) -> str:
+    """Returns the resolved path if the given path follows a Ray-specific custom
+    scheme. Otherwise, returns the path unchanged.
+
+    The supported custom schemes are: "local", "example".
+    """
+    parsed_uri = urllib.parse.urlparse(path)
+    if parsed_uri.scheme == _LOCAL_SCHEME:
+        path = parsed_uri.netloc + parsed_uri.path
+    return path
deltacat/utils/metafile_locator.py ADDED
@@ -0,0 +1,73 @@
+import posixpath
+import pyarrow.fs
+
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+
+"""
+Helper functions to work with deltacat metadata paths.
+TODO: Replace with direct calls to Deltacat storage interface.
+"""
+
+
+def _find_first_child_with_rev(
+    parent_path: str, filesystem: pyarrow.fs.FileSystem
+) -> str:
+    """
+    Walks the filesystem to find the first child directory with a `rev/` folder.
+
+    This is a temporary solution to locate the first Namespace and Table directories.
+    The Deltacat Storage interface will provide a more robust way to locate these directories.
+
+    param: parent_path: The parent directory to search for a child with a `rev/` folder.
+    param: filesystem: The filesystem to search for the child directory.
+    returns: The name of the first child directory with a `rev/` folder.
+    """
+    children = filesystem.get_file_info(
+        pyarrow.fs.FileSelector(parent_path, allow_not_found=True)
+    )
+    for child in children:
+        if child.type == pyarrow.fs.FileType.Directory:
+            rev_path = posixpath.join(child.path, "rev")
+            if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
+                return child.base_name
+    raise ValueError(f"No directory with 'rev/' found under {parent_path}")
+
+
+def _find_table_path(root_path: str, filesystem: pyarrow.fs.FileSystem):
+    """
+    Finds a path with structure: root/namespace_id/table_id
+    Uses _find_first_child_with_rev to determine the namespace and table ids.
+
+    param: root_path: The root directory to search for the namespace and table directories.
+    param: filesystem: The filesystem to search for the namespace and table directories.
+    returns: The path to the table directory.
+    raises: ValueError if the namespace or table directories are not found.
+    """
+    try:
+        # Find Namespace (first directory under root with rev/)
+        namespace_id = _find_first_child_with_rev(root_path, filesystem)
+        namespace_path = posixpath.join(root_path, namespace_id)
+
+        # Find Table (first directory under namespace with rev/)
+        table_id = _find_first_child_with_rev(namespace_path, filesystem)
+        return posixpath.join(namespace_path, table_id)
+
+    except ValueError as e:
+        raise ValueError(f"Failed to locate Namespace or Table: {e}") from e
+
+
+def _find_partition_path(root_path: str, locator: PartitionLocator) -> str:
+    """
+    Finds the path to the partition directory for the specified locator.
+
+    param: root_path: The root path of the dataset.
+    param: locator: The PartitionLocator for the partition.
+    returns: The path to the partition directory.
+    """
+    root_path, filesystem = resolve_path_and_filesystem(root_path)
+    return posixpath.join(
+        _find_table_path(root_path, filesystem),
+        locator.table_version,
+        locator.stream_id,
+    )
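Note: these are private helpers that assume a root/<namespace_id>/<table_id> layout in which each metafile directory carries a rev/ subdirectory. A hedged sketch of that convention on a local filesystem; the namespace_01 and table_01 directory names are made up for illustration:

    # Illustrative sketch of the directory convention these helpers
    # assume (root/<namespace>/<table>, each containing a rev/ folder);
    # the directory names below are hypothetical.
    import os
    import tempfile

    from deltacat.utils.filesystem import resolve_path_and_filesystem
    from deltacat.utils.metafile_locator import _find_table_path

    root = tempfile.mkdtemp()
    os.makedirs(os.path.join(root, "namespace_01", "rev"))
    os.makedirs(os.path.join(root, "namespace_01", "table_01", "rev"))

    root_path, filesystem = resolve_path_and_filesystem(root)
    # Walks root -> first child with rev/ (the namespace) -> first child
    # with rev/ (the table), returning root/namespace_01/table_01.
    print(_find_table_path(root_path, filesystem))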
deltacat/utils/pyarrow.py CHANGED
@@ -1,7 +1,6 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-import copy
 import bz2
 import gzip
 import io
@@ -47,19 +46,6 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
-OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
-
-"""
-By default, round decimal values using half_to_even round mode when
-rescaling a decimal to the given scale and precision in the schema would cause
-data loss. Setting any non null value of this argument will result
-in an error instead.
-"""
-RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
-# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
-DECIMAL256_DEFAULT_SCALE = 38
-DECIMAL256_MAX_PRECISION = 76
-MAX_INT_BYTES = 2147483646
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -78,164 +64,45 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema
 
 
-def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
-    schema = None
-    if (
-        "convert_options" in kwargs
-        and kwargs["convert_options"].column_types is not None
-    ):
-        schema = kwargs["convert_options"].column_types
-        if not isinstance(schema, pa.Schema):
-            schema = pa.schema(schema)
-        if kwargs["convert_options"].include_columns:
-            schema = _filter_schema_for_columns(
-                schema, kwargs["convert_options"].include_columns
-            )
-        elif (
-            kwargs.get("read_options") is not None
-            and kwargs["read_options"].column_names
-        ):
-            schema = _filter_schema_for_columns(
-                schema, kwargs["read_options"].column_names
-            )
-    else:
-        logger.debug(
-            "Schema not specified in the kwargs."
-            " Hence, schema could not be inferred from the empty CSV."
-        )
-
-    return schema
-
-
-def _new_schema_with_replaced_fields(
-    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
-) -> pa.Schema:
-    if schema is None:
-        return None
-
-    new_schema_fields = []
-    for field in schema:
-        new_field = field_to_replace(field)
-        if new_field is not None:
-            new_schema_fields.append(new_field)
-        else:
-            new_schema_fields.append(field)
-
-    return pa.schema(new_schema_fields, metadata=schema.metadata)
-
-
-def _read_csv_rounding_decimal_columns_to_fit_scale(
-    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
-) -> pa.Table:
-    # Note: We read decimals as strings first because CSV
-    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
-    new_schema = _new_schema_with_replaced_fields(
-        schema,
-        lambda fld: (
-            pa.field(fld.name, pa.string(), metadata=fld.metadata)
-            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
-            else None
-        ),
-    )
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"],
-        reader_kwargs,
-    )
-    # Creating a shallow copy for efficiency
-    new_convert_options = copy.copy(new_kwargs["convert_options"])
-    new_convert_options.column_types = new_schema
-    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
-    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
-
-    for column_index, field in enumerate(schema):
-        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
-            column_array = arrow_table[field.name]
-            # We always cast to decimal256 to accomodate fixed scale of 38
-            cast_to_type = pa.decimal256(
-                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
-            )
-            casted_decimal_array = pc.cast(column_array, cast_to_type)
-            # Note that scale can be negative
-            rounded_column_array = pc.round(
-                casted_decimal_array, ndigits=field.type.scale
-            )
-            final_decimal_array = pc.cast(rounded_column_array, field.type)
-            arrow_table = arrow_table.set_column(
-                column_index,
-                field,
-                final_decimal_array,
-            )
-            logger.debug(
-                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
-                f" {field.type.precision} precision"
-            )
-
-    return arrow_table
-
-
-def pyarrow_read_csv_default(*args, **kwargs):
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
-    )
-
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
     try:
+        new_kwargs = sanitize_kwargs_by_supported_kwargs(
+            ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+        )
        return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
-        error_str = e.__str__()
-        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
-
-        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
-        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
-            # Note, this logic requires expensive casting. To prevent downgrading performance
-            # for happy path reads, we are handling this case in response to an error.
-            logger.warning(
-                "Rescaling Decimal to the given scale in the schema. "
-                f"Original error: {error_str}"
-            )
-
-            if schema is not None and "convert_options" in kwargs:
-                if (
-                    "Rescaling Decimal" in error_str
-                    and "value would cause data loss" in error_str
+        if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            schema = None
+            if (
+                "convert_options" in kwargs
+                and kwargs["convert_options"].column_types is not None
+            ):
+                schema = kwargs["convert_options"].column_types
+                if not isinstance(schema, pa.Schema):
+                    schema = pa.schema(schema)
+                if kwargs["convert_options"].include_columns:
+                    schema = _filter_schema_for_columns(
+                        schema, kwargs["convert_options"].include_columns
+                    )
+                elif (
+                    kwargs.get("read_options") is not None
+                    and kwargs["read_options"].column_names
                 ):
-                    logger.debug(f"Checking if the file: {args[0]}...")
-                    # Since we are re-reading the file, we have to seek to beginning
-                    if isinstance(args[0], io.IOBase) and args[0].seekable():
-                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
-                        args[0].seek(0)
-                    return _read_csv_rounding_decimal_columns_to_fit_scale(
-                        schema=schema, reader_args=args, reader_kwargs=kwargs
+                    schema = _filter_schema_for_columns(
+                        schema, kwargs["read_options"].column_names
                     )
+
             else:
                 logger.debug(
-                    "Schema is None when trying to adjust decimal values. "
-                    "Hence, bubbling up exception..."
+                    "Schema not specified in the kwargs."
+                    " Hence, schema could not be inferred from the empty CSV."
                 )
 
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
         raise e
 
 
-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
-
-    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
-    # Below ensures decimal256 is casted properly.
-    schema_includes_decimal256 = (
-        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
-        if schema is not None
-        else None
-    )
-    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
-        # falling back to expensive method of reading CSV
-        return _read_csv_rounding_decimal_columns_to_fit_scale(
-            schema, reader_args=args, reader_kwargs=kwargs
-        )
-    else:
-        return pyarrow_read_csv_default(*args, **kwargs)
-
-
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -544,15 +411,6 @@ def s3_file_to_table(
     if pa_read_func_kwargs_provider is not None:
         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
-    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
-        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
-        if content_type == ContentType.PARQUET.value:
-            logger.debug(
-                f"Overriding {s3_url} content encoding from {content_encoding} "
-                f"to {new_content_encoding}"
-            )
-            content_encoding = new_content_encoding
-
     if (
         content_type == ContentType.PARQUET.value
         and content_encoding == ContentEncoding.IDENTITY.value
@@ -582,8 +440,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )
 
-    if READER_TYPE_KWARG in kwargs:
-        kwargs.pop(READER_TYPE_KWARG)
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -617,18 +475,7 @@ def s3_file_to_parquet(
         f"Reading {s3_url} to PyArrow ParquetFile. "
         f"Content type: {content_type}. Encoding: {content_encoding}"
     )
-    kwargs = {}
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
-    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
-        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
-        if content_type == ContentType.PARQUET.value:
-            logger.debug(
-                f"Overriding {s3_url} content encoding from {content_encoding} "
-                f"to {new_content_encoding}"
-            )
-            content_encoding = new_content_encoding
     if (
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
@@ -641,10 +488,15 @@ def s3_file_to_parquet(
     if s3_client_kwargs is None:
         s3_client_kwargs = {}
 
+    kwargs = {}
+
     if s3_url.startswith("s3://"):
         s3_file_system = create_s3_file_system(s3_client_kwargs)
         kwargs["filesystem"] = s3_file_system
 
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
     logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
 
     kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
@@ -931,6 +783,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
+    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -942,7 +795,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()
 
     if max_str_len is not None:
-        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
+        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
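Net effect of this change: deltacat 2.0 drops the decimal-rescaling fallbacks (OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG, RAISE_ON_DECIMAL_OVERFLOW, pyarrow_read_csv_default, _read_csv_rounding_decimal_columns_to_fit_scale), and pyarrow_read_csv keeps only the empty-CSV fallback: when Arrow raises "Empty CSV file" and raise_on_empty_csv is unset, the schema is recovered from convert_options or read_options and an empty table is returned. A minimal sketch of the retained behavior, assuming pyarrow still raises ArrowInvalid with the exact message "Empty CSV file" for an empty stream:

    # Sketch of the retained empty-CSV fallback in deltacat.utils.pyarrow
    # (2.0 behavior per the diff above); the in-memory stream stands in
    # for a real file.
    import io

    import pyarrow as pa
    import pyarrow.csv as pacsv

    from deltacat.utils.pyarrow import pyarrow_read_csv

    convert_options = pacsv.ConvertOptions(
        column_types={"id": pa.int64(), "name": pa.string()}
    )

    # An empty byte stream makes pacsv.read_csv raise
    # ArrowInvalid("Empty CSV file"); pyarrow_read_csv catches it and
    # returns an empty table with the schema from convert_options.
    empty = io.BytesIO(b"")
    table = pyarrow_read_csv(empty, convert_options=convert_options)
    assert table.num_rows == 0
    assert table.schema.names == ["id", "name"]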