deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,178 @@
1
+ import logging
2
+ import posixpath
3
+ from typing import Generator, Optional
4
+
5
+ import pyarrow
6
+ import pyarrow.fs
7
+
8
+ from deltacat.storage import Delta
9
+ from deltacat.storage.model.partition import PartitionLocator
10
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
11
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
12
+ from deltacat.storage.rivulet.metastore.json_sst import JsonSstReader
13
+ from deltacat.storage.rivulet.metastore.delta import (
14
+ ManifestIO,
15
+ DeltaContext,
16
+ RivuletDelta,
17
+ DeltacatManifestIO,
18
+ )
19
+ from deltacat.storage.rivulet.metastore.sst import SSTReader, SSTable
20
+ from deltacat.utils.metafile_locator import _find_table_path
21
+ from deltacat import logs
22
+
23
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
24
+
25
+
26
class ManifestAccessor:
    """Accessor for retrieving a manifest's SSTable entities."""

    def __init__(
        self, delta: RivuletDelta, file_provider: FileProvider, sst_reader: SSTReader
    ):
        # The RivuletDelta plays the role of the manifest being accessed.
        self.manifest: RivuletDelta = delta
        self.file_provider: FileProvider = file_provider
        self._sst_reader = sst_reader

    @property
    def context(self) -> DeltaContext:
        """Context (metadata) of the delta backing this manifest."""
        return self.manifest.context

    def generate_sstables(self) -> Generator[SSTable, None, None]:
        """
        Lazily read and yield each SortedString Table referenced by this manifest.

        :return a generator of SSTables for this manifest
        """
        # Bind the two hot lookups once; each SST file is opened through the
        # file provider and parsed by the configured SST reader.
        open_input = self.file_provider.provide_input_file
        read_sst = self._sst_reader.read
        for uri in self.manifest.sst_files:
            yield read_sst(open_input(uri))
49
+
50
+
51
class DatasetMetastore:
    """
    Metastore implementation for manifests stored on a filesystem

    TODO this will be replaced with Deltacat Storage interface - https://github.com/ray-project/deltacat/issues/477
    """

    def __init__(
        self,
        # URI at which we expect to find deltas
        delta_root_uri: str,
        file_provider: FileProvider,
        locator: PartitionLocator,
        *,
        manifest_io: ManifestIO = None,
        sst_reader: SSTReader = None,
    ):
        # Cached dataset-wide key bounds; filled lazily by get_min_max_keys().
        self._min_key = None
        self._max_key = None
        self.delta_root_uri = delta_root_uri
        self.file_provider = file_provider
        # Default to the deltacat-backed manifest IO and the JSON SST reader
        # when no implementations are injected.
        self.manifest_io = manifest_io or DeltacatManifestIO(delta_root_uri, locator)
        self.sst_reader = sst_reader or JsonSstReader()
        self.locator = locator

    def _get_delta(
        self, delta_dir: str, filesystem: pyarrow.fs.FileSystem
    ) -> Optional[RivuletDelta]:
        """
        Find the latest revision in a delta directory.

        param: delta_dir: The directory containing the revisions.
        param: filesystem: The filesystem to search for the revisions.
        returns: The latest revision as a RivuletDelta, or None if absent.
        """
        rev_directory = posixpath.join(delta_dir, "rev")
        revisions = filesystem.get_file_info(
            pyarrow.fs.FileSelector(rev_directory, allow_not_found=True)
        )
        if not revisions:
            logger.warning(f"No revision files found in {rev_directory}")
            return None

        # Revision file names sort lexicographically, so the max path is the
        # latest revision.
        latest_revision = max(revisions, key=lambda f: f.path)
        if not latest_revision:
            return None
        return RivuletDelta.of(Delta.read(latest_revision.path))

    def generate_manifests(self) -> Generator[ManifestAccessor, None, None]:
        """
        Generate all manifests within the Metastore
        NOTE: this will be replaced by deltacat storage API.

        TODO: Generate partition path using Deltacat Storage interface.

        returns: a generator of ManifestAccessors for all manifests in the dataset.
        """
        root_path, filesystem = resolve_path_and_filesystem(self.delta_root_uri)

        # Partition directory layout: <table>/<table_version>/<stream>/<partition>
        partition_path = posixpath.join(
            _find_table_path(root_path, filesystem),
            self.locator.table_version,
            self.locator.stream_id,
            self.locator.partition_id,
        )

        partition_info = filesystem.get_file_info(partition_path)
        if partition_info.type != pyarrow.fs.FileType.Directory:
            logger.debug(f"Partition directory {partition_path} not found. Skipping.")
            return

        # The partition must contain a "rev" directory to hold any deltas.
        rev_directory = posixpath.join(partition_path, "rev")
        rev_info = filesystem.get_file_info(rev_directory)
        if rev_info.type != pyarrow.fs.FileType.Directory:
            logger.debug(f"Revision directory {rev_directory} not found. Skipping.")
            return

        # Enumerate the partition's children; delta directories are named by
        # their (numeric) delta id.
        entries = filesystem.get_file_info(
            pyarrow.fs.FileSelector(
                partition_path, allow_not_found=True, recursive=False
            )
        )
        for entry in entries:
            if entry.type != pyarrow.fs.FileType.Directory:
                continue
            if not entry.base_name.isdigit():
                continue
            rivulet_delta = self._get_delta(entry.path, filesystem)
            if rivulet_delta:
                yield ManifestAccessor(
                    rivulet_delta, self.file_provider, self.sst_reader
                )

    def get_min_max_keys(self):
        """
        Compute and cache the minimum and maximum keys in the dataset.

        returns: a tuple of the minimum and maximum keys in the dataset
        """
        # Serve from cache when both bounds were previously computed.
        if self._min_key is not None and self._max_key is not None:
            return (self._min_key, self._max_key)

        min_key = None
        max_key = None
        # Scan every SSTable of every manifest and fold in its key bounds.
        for accessor in self.generate_manifests():
            for sstable in accessor.generate_sstables():
                if min_key is None or sstable.min_key < min_key:
                    min_key = sstable.min_key
                if max_key is None or sstable.max_key > max_key:
                    max_key = sstable.max_key

        self._min_key = min_key
        self._max_key = max_key
        return (min_key, max_key)
@@ -0,0 +1,156 @@
1
+ import logging
2
+ from typing import Generator, Optional, Set, Type, TypeVar, Any
3
+
4
+ from deltacat.storage.model.shard import Shard
5
+ from deltacat.storage.rivulet.metastore.sst import SSTableRow, SSTable
6
+ from deltacat.storage.rivulet.metastore.sst_interval_tree import (
7
+ BlockIntervalTree,
8
+ OrderedBlockGroups,
9
+ )
10
+ from deltacat.storage.rivulet.reader.block_scanner import BlockScanner
11
+ from deltacat.storage.rivulet.reader.dataset_metastore import (
12
+ DatasetMetastore,
13
+ ManifestAccessor,
14
+ )
15
+ from deltacat.storage.rivulet.reader.query_expression import QueryExpression
16
+ from deltacat.storage.rivulet import Schema
17
+
18
+ # The type of data returned to reader
19
+ T = TypeVar("T")
20
+
21
+
22
class DatasetReader:
    """
    DatasetReader is an internal class used to execute a scan

    TODO - Currently, this reader is limited to reading a single field group
    The next CR will fast follow to modify this to read and zipper multiple field groups

    TODO currently this assumes all SST files are L0 files with overlapping key ranges
    Future CR will support L1+ SSTs
    """

    BLOCK_READER_POOL_SIZE = 8

    def __init__(self, metastore: DatasetMetastore):
        """
        :param metastore: metastore used to enumerate the manifests/SSTs to scan
        """
        self.metastore: DatasetMetastore = metastore
        self.block_scanner = BlockScanner(self.metastore)

    def scan(
        self,
        schema: Schema,
        deserialize_to: Type[T],
        # Fix: the annotation was previously `QueryExpression[Any]()`, which
        # instantiated a QueryExpression at definition time instead of naming
        # the type.
        query: QueryExpression[Any],
        shard: Optional[Shard] = None,
    ) -> Generator[T, None, None]:
        """
        Scan records given query and deserialize to desired memory output format

        :param schema: schema of the records to scan
        :param deserialize_to: memory format to deserialize each record into
        :param query: key-range predicate restricting the scan
        :param shard: optional shard whose bounds further restrict the scan
        :returns: generator of deserialized records

        # TODO handle "partial schema" use case, in which the query schema is a subset of full schema

        # TODO this is where we will do the zipper merge when we support multiple field groups
        # for each SST row which may overlap key range, read data chunk
        # we will later improve and parallelize this when we do zipper merge work
        """
        # Read manifests and differentiate between "full schema" and "zipper merge" use case
        manifests = set(self.metastore.generate_manifests())
        schemas = {manifest.context.schema for manifest in manifests}
        levels = {manifest.context.level for manifest in manifests}
        # Must zipper if there are multiple schemas
        cannot_avoid_zipper = len(schemas) > 1
        # Must zipper if L0 is involved or if manifests span multiple levels.
        # Fix: this previously tested `len(levels) > 0`, which is true whenever
        # any manifest exists and therefore forced the zipper path
        # unconditionally, contradicting the comment above.
        cannot_avoid_zipper |= 0 in levels or len(levels) > 1

        # Fix: the two log messages below were swapped between the branches.
        if cannot_avoid_zipper:
            logging.info("Done scanning manifests. Must perform zipper-merge")
            yield from self.__scan_with_zipper(
                schema, deserialize_to, manifests, query, shard=shard
            )
        else:
            logging.info("Done scanning manifests. Can avoid zipper-merge")
            yield from self.__scan_no_zipper(
                schema, deserialize_to, manifests, query, shard=shard
            )

    def __scan_no_zipper(
        self,
        schema: Schema,
        deserialize_to: Type[T],
        manifests: Set[ManifestAccessor],
        query: QueryExpression[Any],
        shard: Optional[Shard] = None,
    ) -> Generator[T, None, None]:
        """Scan a single level/schema: no merge across manifests is required."""
        # Build final query using user query and shard boundaries (ensures only blocks in shard and query range are read).
        # TODO: improve query expression implementation to have a builder of some sort.
        query = QueryExpression.with_shard(query, shard)
        # Map manifests to all SST rows which match query
        matching_sst_rows: Set[SSTableRow] = {
            row
            for manifest in manifests
            for table in manifest.generate_sstables()
            for row in self.__load_sst_rows(table, query)
        }

        yield from self.block_scanner.scan(
            schema, deserialize_to, matching_sst_rows, query
        )

    def __scan_with_zipper(
        self,
        schema: Schema,
        deserialize_to: Type[T],
        manifests: Set[ManifestAccessor],
        query: QueryExpression[Any],
        shard: Optional[Shard] = None,
    ) -> Generator[T, None, None]:
        """Scan overlapping manifests by zipper-merging their block groups."""
        # Build final query using user query and shard boundaries (ensures only blocks in shard and query range are read).
        # TODO: improve query expression implementation to have a builder of some sort.
        query = QueryExpression.with_shard(query, shard)
        # Build interval tree from manifests and plan scan
        sst_interval_tree = BlockIntervalTree()
        for manifest in manifests:
            for table in manifest.generate_sstables():
                rows = self.__load_sst_rows(table, query)
                sst_interval_tree.add_sst_rows(rows, manifest.context)

        scan_block_groups: OrderedBlockGroups = (
            sst_interval_tree.get_sorted_block_groups(query.min_key, query.max_key)
        )
        yield from self.block_scanner.scan_with_zipper(
            schema, deserialize_to, scan_block_groups, query
        )

    def __load_sst_rows(
        self, table: SSTable, query: QueryExpression
    ) -> Set[SSTableRow]:
        """Return the rows of ``table`` whose key range overlaps ``query``."""
        # Short circuit table if there isn't any overlap with min and max
        if not self.__overlaps_primary_key_range(query, table.min_key, table.max_key):
            return set()
        return {
            r
            for r in table.rows
            if self.__overlaps_primary_key_range(query, r.key_min, r.key_max)
        }

    def __overlaps_primary_key_range(
        self, query: QueryExpression, min_key, max_key
    ) -> bool:
        """
        Helper method to check whether a query expression has overlap with a primary key range
        """
        # If no PK range set, the query is across all primary keys, so return true
        if not query.key_range:
            return True

        query_start, query_end = query.key_range
        # Two inclusive ranges overlap unless one ends before the other starts.
        return not (query_end < min_key or query_start > max_key)
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Generator, Dict, Type, NamedTuple, List
4
+
5
+ from pyarrow import RecordBatch
6
+
7
+ from deltacat.storage.rivulet.reader.data_reader import DataReader, MEMORY_FORMAT
8
+ import pyarrow as pa
9
+
10
+
11
class RecordBatchRowIndex(NamedTuple):
    """
    Named tuple for a record batch with an index into a specific row
    Note that record batches store data by column, so the row index should be
    used to index into each column array
    """

    # The pyarrow RecordBatch holding the row's data (column-oriented).
    batch: RecordBatch
    # Zero-based row position within `batch`; apply to each column array.
    row_index: int
20
+
21
+
22
class ArrowDataReader(DataReader[RecordBatchRowIndex]):
    """
    DataReader that deserializes (RecordBatch, row index) pairs into the
    caller-requested in-memory format (PyDict or pyarrow RecordBatch).
    """

    def deserialize_records(
        self, record: RecordBatchRowIndex, output_type: Type[MEMORY_FORMAT]
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Deserialize records into the specified format.

        Note that output_type gets set based on what a DataScan converts results to,
        e.g. to_arrow, to_dict

        :param record: Input data (generated by generate_records method)
        :param output_type: Type to deserialize into
        :returns: A generator yielding records of the specified type.
        """
        # NOTE(review): `record` is annotated as RecordBatchRowIndex but is
        # indexed with [0] before accessing .batch/.row_index, which implies it
        # actually receives a sequence of RecordBatchRowIndex — a bare
        # RecordBatchRowIndex would make record[0] the RecordBatch itself and
        # fail on `.batch`. Confirm against the generate_records caller.
        batch, row_idx = record[0].batch, record[0].row_index

        if output_type == Dict:
            # One {column name: python value} mapping for the selected row.
            yield {
                column: batch.column(column_idx)[row_idx].as_py()
                for column_idx, column in enumerate(batch.column_names)
            }

        elif output_type == RecordBatch:
            # only yield full record batch if row_idx is 0.
            # TODO this logic will need to change in zipper use case across data formats
            if row_idx == 0:
                yield batch

    def join_deserialize_records(
        self,
        records: List[RecordBatchRowIndex],
        output_type: Type[MEMORY_FORMAT],
        join_key: str,
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Join multiple (batch, row) records around ``join_key`` and deserialize
        the joined row into the specified format.

        Note that output_type gets set based on what a DataScan converts results to,
        e.g. to_arrow, to_dict

        :param records: Input data (generated by generate_records method)
        :param output_type: Type to deserialize into
        :param join_key: column name shared by all records to join on
        :returns: A generator yielding records of the specified type.
        """

        if output_type == Dict:
            yield self.__join_records_as_dict(records)
        elif output_type == RecordBatch:
            yield self.__join_records_as_record_batch(records, join_key)

    @staticmethod
    def __join_records_as_dict(records: List[RecordBatchRowIndex]) -> Dict[str, any]:
        """
        Deserialize records into a PyDict

        :param records: input record data
        :returns: A PyDict that's joined the given records around the primary key.
        :raises IndexError: if a record's row index exceeds its batch length.
        """
        batch: RecordBatch
        row_idx: int
        out = {}
        for r in records:
            batch, row_idx = r
            # Note this stomps over join key but that's OK
            # (later records overwrite columns of the same name).
            for column_idx, column in enumerate(batch.schema.names):
                col = batch.column(column_idx)
                if len(col) <= row_idx:
                    raise IndexError(
                        f"row index {row_idx} out of bounds for column {column} with length {len(col)}"
                    )

                out.update({column: col[row_idx].as_py()})
        return out

    @staticmethod
    def __join_records_as_record_batch(
        records: List[RecordBatchRowIndex], join_key: str
    ) -> RecordBatch:
        """
        Deserialize records into a RecordBatch

        :param records: input record data
        :param join_key: column name to inner-join successive records on
        :returns: RecordBatch that's inner-joined the given records around the primary key.
        """
        batch: RecordBatch
        row_idx: int
        out: pa.Table | None = None
        for record in records:
            batch, row_idx = record
            # Take the single referenced row as a 1-row batch.
            batch_slice: RecordBatch = batch.slice(row_idx, 1)
            # NOTE(review): `if not out` relies on pa.Table truthiness (length
            # based), so a zero-row accumulated table would be re-initialized
            # here; `out is None` would state the intent more precisely —
            # confirm slices are always one row.
            if not out:
                out = pa.Table.from_batches([batch_slice])
            else:
                table2 = pa.Table.from_batches([batch_slice])
                out = out.join(table2, keys=join_key, join_type="inner")
        return out.to_batches()[0]
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import typing
4
+ from typing import Optional
5
+
6
+ from deltacat.storage.model.shard import Shard
7
+
8
T = typing.TypeVar("T")  # Type of primary key in query expression. Must be comparable


class QueryExpression(typing.Generic[T]):
    """
    Top level class for creating representing queries on a riv dataset.

    For now, this is a minimal implementation which just allows for different predicates.

    FUTURE IMPROVEMENTS
    1. Support builder using operator overloading or fluent builder pattern,e.g.
       (operator overloading) query = Column("Foo") < 10 & Column("PK")==100
       (fluent interface) query = builder.column("colA").less_than(10)
                          .and_()
                          .column("PK").equals(100)
                          .build()

    2. Support better push down predicate integration end to end. Specifically,
       scan operation will need to return which query predicates were honored
    """

    def __init__(self):
        # Inclusive (min, max) bound on the primary key; None means the query
        # spans all primary keys.
        # Fix: annotation was `Optional[(T, T)]`, which is not a valid tuple type.
        self.key_range: Optional[typing.Tuple[T, T]] = None

    def with_key(self, val: T) -> QueryExpression:
        """
        Syntactic sugar for setting key range to a single value

        :raises ValueError: if a key range was already set.
        """
        if self.key_range:
            raise ValueError(
                f"Query expression already has set key range to: {self.key_range}"
            )
        self.key_range = (val, val)
        return self

    def with_range(self, bound1: T, bound2: T) -> QueryExpression:
        """
        Set the inclusive key range; bounds may be passed in either order.

        :raises ValueError: if a key range was already set.
        """
        if self.key_range:
            raise ValueError(f"Key range already set to {self.key_range}")
        self.key_range = tuple(sorted([bound1, bound2]))
        return self

    @staticmethod
    def with_shard(
        query: Optional[QueryExpression], shard: Optional[Shard]
    ) -> Optional[QueryExpression]:
        """
        Generate a query expression that accounts for the shard boundaries.
        Shard boundaries are inclusive and mark the outer bounds of the query.
        """
        if shard is None:
            return query

        # Fix: a None query previously raised AttributeError; treat it the
        # same as an unbounded query over the whole shard.
        if query is None or query.key_range is None:
            return QueryExpression().with_range(shard.min_key, shard.max_key)

        # Clamp the query range to the shard's bounds (intersection). The
        # original code widened to the union of the two ranges, which let a
        # query escape the shard's stated outer bounds.
        min_key = shard.min_key
        max_key = shard.max_key
        if query.min_key > min_key:
            min_key = query.min_key
        if query.max_key < max_key:
            max_key = query.max_key

        # Assign the range directly (not via with_range) so a disjoint
        # query/shard pair yields an inverted, never-matching range instead of
        # being re-sorted into an unintended valid range.
        clamped = QueryExpression()
        clamped.key_range = (min_key, max_key)
        return clamped

    @property
    def min_key(self) -> T | None:
        """Lower bound of the key range, or None if no range is set."""
        if not self.key_range:
            return None
        return self.key_range[0]

    @property
    def max_key(self) -> T | None:
        """Upper bound of the key range, or None if no range is set."""
        if not self.key_range:
            return None
        return self.key_range[1]

    def matches_query(self, key: typing.Any) -> bool:
        """
        Returns true if the key is within the range of the query expression
        """
        if not self.key_range:
            return True
        return self.min_key <= key <= self.max_key

    def below_query_range(self, key: typing.Any) -> bool:
        """
        Returns true if the key is below the range of the query expression
        will return false if key range is not set
        """
        if not self.key_range:
            return False
        return self.min_key > key
@@ -0,0 +1,84 @@
1
from typing import Dict, Optional, Type

from deltacat.storage.rivulet.fs.file_provider import FileProvider
from deltacat.storage.rivulet.metastore.sst import SSTableRow
from deltacat.storage.rivulet.reader.data_reader import FileReader
from deltacat.storage.rivulet.schema.schema import Schema
7
+
8
+
9
class FileReaderRegistrar:
    """
    Registrar for readers of rivulet data.

    Readers must adhere to the Protocol FileReader and are keyed by
    (lowercased) file extension.

    Packages with extension classes should call into this registrar in __init__.py
    """

    # Class-level registry shared across the process: lowercased extension -> reader class.
    _readers: Dict[str, Type[FileReader]] = {}

    @classmethod
    def register_reader(
        cls,
        extension: str,
        reader_class: Type[FileReader],
        allow_overwrite: bool = False,
    ) -> None:
        """
        Register a file extension associated with a dataset reader.

        Parameters:
        - extension: str, the file extension to register (case-insensitive)
        - reader_class: Type[FileReader], the reader class to associate with the extension
        - allow_overwrite: bool, if True, allows overwriting an existing reader for the extension

        :raises ValueError: if the extension is already registered and
            allow_overwrite is False
        """
        # Normalize BEFORE the duplicate check; the original checked the raw
        # extension but stored the lowercased one, so "CSV" after "csv"
        # silently overwrote the existing registration.
        normalized_extension = extension.lower()
        if normalized_extension in cls._readers and not allow_overwrite:
            raise ValueError(
                f"Reader for extension '{extension}' is already registered. "
                f"Set allow_overwrite=True to replace the existing reader."
            )
        cls._readers[normalized_extension] = reader_class

    @classmethod
    def get_reader_class(cls, uri: str) -> Type[FileReader]:
        """
        Gets the reader class given a URI.

        :param uri: URI of file to be read. Note that we expect the URI to end in a file extension
        :raises ValueError: if no registered data reader is found for the URI's extension type
        """
        # Extension is everything after the last "." in the URI.
        extension = uri.split(".")[-1].lower()

        reader_class = cls._readers.get(extension)
        if reader_class is None:
            # Raise as documented instead of returning None, which previously
            # surfaced later as "'NoneType' object is not callable".
            raise ValueError(
                f"No file reader registered for extension '{extension}'"
            )
        return reader_class

    @classmethod
    def construct_reader_instance(
        cls,
        sst_row: SSTableRow,
        file_provider: FileProvider,
        primary_key: str,
        schema: Schema,
        reader_cache: Optional[Dict[str, FileReader]] = None,
    ) -> FileReader:
        """
        Construct (or fetch from cache) a data reader for the given SST row.

        :param sst_row: SST row whose uri (expected to end in a file
            extension) determines the reader type
        :param file_provider: file provider passed through to the reader
        :param primary_key: primary key passed through to the reader
        :param schema: schema passed through to the reader
        :param reader_cache: Optional cache of readers keyed by extension
        :raises ValueError: if no registered data reader is found for the URI's extension type
        """
        extension = sst_row.uri.split(".")[-1].lower()

        if reader_cache is not None and extension in reader_cache:
            return reader_cache[extension]

        reader_class = cls.get_reader_class(sst_row.uri)
        reader_instance = reader_class(sst_row, file_provider, primary_key, schema)

        # "is not None", not truthiness: a caller-supplied EMPTY cache dict
        # must still be populated (the original skipped it).
        if reader_cache is not None:
            reader_cache[extension] = reader_instance

        return reader_instance
File without changes