deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Callable, Any
4
+
5
+ from deltacat.storage.rivulet.field_group import FieldGroup
6
+ from deltacat.storage.rivulet.mvp.Table import MvpTable
7
+ from deltacat.storage.rivulet import Schema
8
+ from deltacat.storage.rivulet.reader.data_scan import DataScan
9
+ from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
10
+ from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
11
+ from deltacat.storage.rivulet.reader.query_expression import QueryExpression
12
+
13
+
14
+ class DatasetExecutor:
15
+ """
16
+ Executor class which runs operations such as select, map, take, save
17
+
18
+ This class may store intermediate state while it is executing.
19
+
20
+ LogicalPlan is responsible for constructing an executor and ordering operations appropriately
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ field_groups: List[FieldGroup],
26
+ schema: Schema,
27
+ metastore: DatasetMetastore,
28
+ ):
29
+ self.effective_schema: Schema = schema.__deepcopy__()
30
+ self.field_groups = field_groups
31
+ self.output: MvpTable | None = None
32
+ self._metastore = metastore
33
+
34
+ def collect(self) -> MvpTable:
35
+ if not self.output:
36
+ self.output = self._read(self.effective_schema)
37
+ return self.output
38
+
39
+ def select(self, fields: List[str]) -> "DatasetExecutor":
40
+ """
41
+ Reads data and selects a subset of fields
42
+ Note that this implementation is super inefficient (does not push down filters to read, copies data to new MvpTable). That is OK since this will all be replaced
43
+ """
44
+ # Read data from original input sources if not already read
45
+ if not self.output:
46
+ self.output = self._read(self.effective_schema)
47
+ # Calculate effective schema and apply it to data
48
+ self.effective_schema.filter(fields)
49
+ self.output = MvpTable(
50
+ {
51
+ key: value
52
+ for key, value in self.output.data.items()
53
+ if key in self.effective_schema
54
+ },
55
+ )
56
+ return self
57
+
58
+ def map(self, transform: Callable[[Any], Any]) -> "DatasetExecutor":
59
+ raise NotImplementedError
60
+
61
+ def _read(self, schema: Schema) -> MvpTable:
62
+ """
63
+ Internal helper method to read data
64
+
65
+ TODO for now this is doing dumb in-memory implementation and later this is going to be replaced by rust library
66
+ """
67
+ if len(self.field_groups) == 1:
68
+ return self._read_as_mvp_table(schema, self.field_groups[0])
69
+ else:
70
+ ds1 = self._read_as_mvp_table(schema, self.field_groups[0])
71
+ ds2 = self._read_as_mvp_table(schema, self.field_groups[1])
72
+ merged = MvpTable.merge(ds1, ds2, schema.primary_key.name)
73
+ for i in range(2, len(self.field_groups)):
74
+ ds_i = self._read_as_mvp_table(schema, self.field_groups[i])
75
+ merged = MvpTable.merge(merged, ds_i, schema.primary_key.name)
76
+ return merged
77
+
78
+ def _read_as_mvp_table(self, schema: Schema, field_group: FieldGroup):
79
+ data = list(
80
+ DataScan(
81
+ schema, QueryExpression(), DatasetReader(self._metastore)
82
+ ).to_pydict()
83
+ )
84
+ output = {}
85
+ for key in schema.fields.keys():
86
+ output[key] = [d.get(key) for d in data]
87
+ return MvpTable(output)
@@ -0,0 +1,5 @@
1
+ # TODO later on this will be moved to a dedicated package
2
+ from deltacat.storage.rivulet.feather.file_reader import FeatherFileReader
3
+ from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
4
+
5
+ FileReaderRegistrar.register_reader("feather", FeatherFileReader)
@@ -0,0 +1,136 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+
5
+ import pyarrow.ipc
6
+ from pyarrow import RecordBatch, RecordBatchFileReader
7
+
8
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
9
+ from deltacat.storage.rivulet.metastore.sst import SSTableRow
10
+ from deltacat.storage.rivulet.reader.data_reader import (
11
+ RowAndKey,
12
+ FileReader,
13
+ FILE_FORMAT,
14
+ )
15
+ from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
16
+ from deltacat.storage.rivulet.schema.schema import Schema
17
+
18
+
19
+ class FeatherFileReader(FileReader[RecordBatchRowIndex]):
20
+ """
21
+ Feather file reader. This class is not thread safe
22
+
23
+ This is mostly a copy-paste from ParquetFileReader
24
+ TODO can consider abstracting code between this and ParquetFileReader
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ sst_row: SSTableRow,
30
+ file_provider: FileProvider,
31
+ primary_key: str,
32
+ schema: Schema,
33
+ ):
34
+ self.sst_row = sst_row
35
+ self.input = file_provider.provide_input_file(sst_row.uri)
36
+
37
+ self.key = primary_key
38
+ self.feather_file = sst_row.uri
39
+
40
+ self.schema = schema
41
+
42
+ # Iterator from pyarrow iter_batches API call. Pyarrow manages state of traversal within parquet row groups
43
+
44
+ """
45
+ These variables keep state about where the iterator currently is. They are initialized in __enter__()
46
+ """
47
+ self._curr_batch: RecordBatch | None = None
48
+ self._feather_reader: RecordBatchFileReader | None = None
49
+ # Arrow only lets you read feather files chunk by chunk using
50
+ # RecordBatchFileReader.get_batch(index)
51
+ self._curr_batch_index = 0
52
+ self._curr_row_offset = 0
53
+ self._pk_col = None
54
+
55
+ def peek(self) -> Optional[RowAndKey[FILE_FORMAT]]:
56
+ """
57
+ Peek next record
58
+
59
+ Note that there is an edge case where peek() is called on the boundary between record batches
60
+ This only happens when curr_row_offset == curr_batch.num_rows, meaning next() or peek() would need to advance
61
+ to the next record batch. When this happens, peek() increments _curr_batch and sets _curr_row_offset to 0
62
+
63
+ :return: Optional of RowAndPrimaryKey
64
+ """
65
+ if not self.__is_initialized():
66
+ raise RuntimeError(
67
+ "ParquetFileReader must be initialized with __enter__ before reading"
68
+ )
69
+
70
+ if self.__need_to_advance_record_batch():
71
+ try:
72
+ self.__advance_record_batch()
73
+ except StopIteration:
74
+ return None
75
+
76
+ pk = self._pk_col[self._curr_row_offset].as_py()
77
+ return RowAndKey(
78
+ RecordBatchRowIndex(self._curr_batch, self._curr_row_offset), pk
79
+ )
80
+
81
+ def __next__(self) -> RowAndKey[FILE_FORMAT]:
82
+ if not self.__is_initialized():
83
+ raise RuntimeError(
84
+ "ParquetFileReader must be initialized with __enter__ before reading"
85
+ )
86
+
87
+ if self.__need_to_advance_record_batch():
88
+ self.__advance_record_batch()
89
+ pk = self._pk_col[0].as_py()
90
+ return RowAndKey(RecordBatchRowIndex(self._curr_batch, 0), pk)
91
+ else:
92
+ pk = self._pk_col[self._curr_row_offset].as_py()
93
+ offset = self._curr_row_offset
94
+ self._curr_row_offset += 1
95
+ return RowAndKey(RecordBatchRowIndex(self._curr_batch, offset), pk)
96
+
97
+ def __enter__(self):
98
+ with self.input.open() as f:
99
+ self._feather_reader = pyarrow.ipc.RecordBatchFileReader(f)
100
+ self.__advance_record_batch()
101
+
102
+ def __exit__(self, __exc_type, __exc_value, __traceback):
103
+ self.close()
104
+ # return False to propagate up error messages
105
+ return False
106
+
107
+ def close(self):
108
+ # no op
109
+ return
110
+
111
+ def __is_initialized(self):
112
+ return self._curr_batch and self._pk_col
113
+
114
+ def __need_to_advance_record_batch(self):
115
+ return not self._curr_row_offset < self._curr_batch.num_rows
116
+
117
+ def __advance_record_batch(self):
118
+ """
119
+ Advance to next record batch
120
+ :raise StopIteration: If there are no more record batches
121
+ """
122
+ try:
123
+ self._curr_batch = self._feather_reader.get_batch(self._curr_batch_index)
124
+ self._curr_batch_index += 1
125
+ self._curr_row_offset = 0
126
+ self._pk_col = self._curr_batch[self.key]
127
+ # Filter the batch to only include fields in the schema
128
+ # Pyarrow select will throw a ValueError if the field is not in the schema
129
+ fields = [
130
+ field
131
+ for field in self.schema.keys()
132
+ if field in self._curr_batch.schema.names
133
+ ]
134
+ self._curr_batch = self._curr_batch.select(fields)
135
+ except ValueError:
136
+ raise StopIteration(f"Ended iteration at batch {self._curr_batch_index}")
@@ -0,0 +1,35 @@
1
+ from typing import List
2
+
3
+ import pyarrow as pa
4
+ from pyarrow import feather
5
+
6
+ from deltacat.storage.rivulet.metastore.sst import SSTableRow
7
+ from deltacat.storage.rivulet import Schema
8
+ from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
9
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
10
+
11
+
12
+ class FeatherDataSerializer(ArrowSerializer):
13
+ """
14
+ Feather data writer. Responsible for flushing rows to feather files and returning SSTable rows for any file(s) written
15
+
16
+ TODO Support recording byte range offsets. Deferring this for now
17
+ We may need to provide a wrapper class over fsspec which introspects how many bytes written
18
+ when .write is called on output stream
19
+ """
20
+
21
+ def __init__(self, file_provider: FileProvider, schema: Schema):
22
+ super().__init__(file_provider, schema)
23
+
24
+ def serialize(self, table: pa.Table) -> List[SSTableRow]:
25
+ file = self.file_provider.provide_data_file("feather")
26
+
27
+ with file.create() as outfile:
28
+ # Note that write_feather says that dest is a string, but it is really any object implementing write()
29
+ feather.write_feather(table, dest=outfile)
30
+
31
+ # Because we only write one row group, it only creates one SSTableRow
32
+ # we may have more granular SST indexes for other formats
33
+ key_min, key_max = self._get_min_max_key(table)
34
+ # TODO need to populate byte offsets. For now, we are writing single files per SSTableRow
35
+ return [SSTableRow(key_min, key_max, file.location, 0, 0)]
File without changes
@@ -0,0 +1,105 @@
1
+ import posixpath
2
+ import time
3
+ from typing import List, Generator
4
+
5
+ from deltacat.storage.model.partition import PartitionLocator
6
+ from deltacat.storage.rivulet.fs.file_store import FileStore
7
+ from deltacat.storage.rivulet.fs.input_file import InputFile
8
+ from deltacat.storage.rivulet.fs.output_file import OutputFile
9
+ from deltacat.utils.metafile_locator import _find_partition_path
10
+
11
+
12
+ class FileProvider:
13
+ """
14
+ Manages the generation of URIs for data and metadata files and facilitates the creation of files at those URIs.
15
+ All files are generated relative to the root of the storage location.
16
+
17
+ This class is inspired by the Iceberg `LocationProvider` and provides methods
18
+ to generate paths for various types of files (e.g., data files, SSTs, and manifests)
19
+ while maintaining a clear structure within the dataset.
20
+
21
+ TODO (deltacat): FileProvider will be replaced/refactored once we are able to integrate with Deltacat.
22
+ TODO: Incorporate additional file naming conventions, such as including
23
+ partitionId, taskId, and operationId, to improve traceability and
24
+ idempotency.
25
+ """
26
+
27
+ uri: str
28
+
29
+ def __init__(self, uri: str, locator: PartitionLocator, file_store: FileStore):
30
+ """
31
+ Initializes the file provider.
32
+
33
+ param: uri: Base URI of the dataset.
34
+ param: file_store: FileStore instance for creating and reading files.
35
+ """
36
+ self.uri = uri
37
+ self._locator = locator
38
+ self._file_store = file_store
39
+
40
+ def provide_data_file(self, extension: str) -> OutputFile:
41
+ """
42
+ Creates a new data file.
43
+
44
+ TODO: Ensure storage interface can provide data files.
45
+
46
+ param: extension: File extension (e.g., "parquet").
47
+ returns: OutputFile instance pointing to the created data file.
48
+ """
49
+ partition_path = _find_partition_path(self.uri, self._locator)
50
+ uri = posixpath.join(
51
+ partition_path, "data", f"{int(time.time_ns())}.{extension}"
52
+ )
53
+ return self._file_store.create_output_file(uri)
54
+
55
+ def provide_l0_sst_file(self) -> OutputFile:
56
+ """
57
+ Creates a new L0 SST file.
58
+
59
+ TODO: Ensure storage interface can provide sst files.
60
+
61
+ returns: OutputFile instance pointing to the created SST file.
62
+ """
63
+ partition_path = _find_partition_path(self.uri, self._locator)
64
+ uri = posixpath.join(
65
+ partition_path, "metadata", "ssts", "0", f"{int(time.time_ns())}.json"
66
+ )
67
+ return self._file_store.create_output_file(uri)
68
+
69
+ def provide_input_file(self, uri: str) -> InputFile:
70
+ """
71
+ Reads an existing file.
72
+
73
+ param sst_uri: URI of the file to read.
74
+ returns: InputFile instance for the specified URI.
75
+ """
76
+ return self._file_store.create_input_file(uri)
77
+
78
+ def provide_manifest_file(self) -> OutputFile:
79
+ """
80
+ Creates a new manifest file.
81
+
82
+ returns: OutputFile instance pointing to the created manifest file.
83
+ """
84
+ uri = f"{self.uri}/metadata/manifests/{int(time.time_ns())}.json"
85
+ return self._file_store.create_output_file(uri)
86
+
87
+ def get_sst_scan_directories(self) -> List[str]:
88
+ """
89
+ Retrieves SST scan directories.
90
+
91
+ returns: List of directories containing SSTs.
92
+ """
93
+ partition_path = _find_partition_path(self.uri, self._locator)
94
+ return [f"{partition_path}/metadata/ssts/0/"]
95
+
96
+ def generate_sst_uris(self) -> Generator[InputFile, None, None]:
97
+ """
98
+ Generates all SST URIs.
99
+
100
+ returns: Generator of InputFile instances for SSTs.
101
+ """
102
+ sst_directories = self.get_sst_scan_directories()
103
+ for directory in sst_directories:
104
+ for file in self._file_store.list_files(directory):
105
+ yield file
@@ -0,0 +1,130 @@
1
+ from typing import Tuple, Iterator, Optional
2
+ from pyarrow.fs import FileSystem, FileType, FileSelector
3
+
4
+ # TODO(deltacat): Rely on deltacat implementation to resolve path and filesystem.
5
+ from ray.data.datasource.path_util import _resolve_paths_and_filesystem
6
+
7
+ from deltacat.storage.rivulet.fs.input_file import FSInputFile
8
+ from deltacat.storage.rivulet.fs.output_file import FSOutputFile
9
+
10
+
11
+ class FileStore:
12
+ """
13
+ Manages the filesystem and low-level file operations.
14
+ This class is designed to work with any filesystem supported by PyArrow: local, S3, HDFS, GCP,
15
+ and other fsspec-compatible filesystems.
16
+
17
+ TODO: Add better error consolidation between filesystems. Will be handled by deltacat implementation?
18
+
19
+ method: `filesystem`: Resolves and normalizes a given path and filesystem.
20
+ method: `file_exists`: Checks if a file exists at a given URI.
21
+ method: `create_file`: Creates a new file for writing at a specified URI.
22
+ method: `read_file`: Reads an existing file from a specified URI.
23
+ method: `list_files`: Lists all files within a specified directory URI.
24
+ """
25
+
26
+ def __init__(self, path: str, filesystem: Optional[FileSystem] = None):
27
+ """
28
+ Serves as the source of truth for all file operations, ensuring that
29
+ all paths and operations are relative to the specified filesystem,
30
+ providing consistency and compatibility across fsspec supported backends.
31
+
32
+ TODO (deltacat): maybe rely on deltacat catalog as a source of truth for rivulet filesystem.
33
+
34
+ param: path (str): The base URI or path for the filesystem.
35
+ param: filesystem (FileSystem): A PyArrow filesystem instance.
36
+ """
37
+ _, fs = FileStore.filesystem(path, filesystem)
38
+ self.filesystem = filesystem or fs
39
+
40
+ @staticmethod
41
+ def filesystem(
42
+ path: str, filesystem: Optional[FileSystem] = None
43
+ ) -> Tuple[str, FileSystem]:
44
+ """
45
+ Resolves and normalizes the given path and filesystem.
46
+
47
+ param: path (str): The URI or path to resolve.
48
+ param: filesystem (Optional[FileSystem]): An optional filesystem instance.
49
+ returns: Tuple[str, FileSystem]: The normalized path and filesystem.
50
+ raises: AssertionError: If multiple paths are resolved.
51
+ """
52
+ paths, filesystem = _resolve_paths_and_filesystem(
53
+ paths=path, filesystem=filesystem
54
+ )
55
+ assert len(paths) == 1, "Multiple paths not supported"
56
+ return paths[0], filesystem
57
+
58
+ def file_exists(
59
+ self, data_uri: str, filesystem: Optional[FileSystem] = None
60
+ ) -> bool:
61
+ """
62
+ Checks if a file exists at the specified URI.
63
+
64
+ param: data_uri (str): The URI of the file to check.
65
+ param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
66
+ returns: bool: True if the file exists, False otherwise.
67
+ """
68
+ path, filesystem = FileStore.filesystem(data_uri, filesystem or self.filesystem)
69
+ return filesystem.get_file_info(path).type != FileType.NotFound
70
+
71
+ def create_output_file(
72
+ self, data_uri: str, filesystem: Optional[FileSystem] = None
73
+ ) -> FSOutputFile:
74
+ """
75
+ Creates a new output file for writing at the specified URI.
76
+
77
+ param: data_uri (str): The URI where the file will be created.
78
+ param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
79
+ returns: FSOutputFile: An object for writing to the file.
80
+ raises: IOError: If file creation fails.
81
+ """
82
+ try:
83
+ path, filesystem = FileStore.filesystem(
84
+ data_uri, filesystem or self.filesystem
85
+ )
86
+ return FSOutputFile(path, filesystem)
87
+ except Exception as e:
88
+ raise IOError(f"Failed to create file '{data_uri}': {e}")
89
+
90
+ def create_input_file(
91
+ self, data_uri: str, filesystem: Optional[FileSystem] = None
92
+ ) -> FSInputFile:
93
+ """
94
+ Create a new input file for reading at the specified URI.
95
+
96
+ param: data_uri (str): The URI of the file to read.
97
+ param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
98
+ returns: FSInputFile: An object for reading the file.
99
+ raises: IOError: If file reading fails.
100
+ """
101
+ try:
102
+ path, filesystem = FileStore.filesystem(
103
+ data_uri, filesystem or self.filesystem
104
+ )
105
+ return FSInputFile(path, filesystem)
106
+ except Exception as e:
107
+ raise IOError(f"Failed to read file '{data_uri}': {e}")
108
+
109
+ def list_files(
110
+ self, data_uri: str, filesystem: Optional[FileSystem] = None
111
+ ) -> Iterator[FSInputFile]:
112
+ """
113
+ Lists all files in the specified directory URI.
114
+
115
+ param: data_uri (str): The URI of the directory to list files from.
116
+ param: filesystem (Optional[FileSystem]): Filesystem to use. Defaults to the instance filesystem.
117
+ returns: Iterator[FSInputFile]: An iterator of FSInputFile objects representing the files.
118
+ raises: IOError: If listing files fails.
119
+ """
120
+ try:
121
+ path, filesystem = FileStore.filesystem(
122
+ data_uri, filesystem or self.filesystem
123
+ )
124
+ file_info = filesystem.get_file_info(FileSelector(path, recursive=False))
125
+
126
+ for file in file_info:
127
+ if file.type == FileType.File:
128
+ yield FSInputFile(file.path, filesystem)
129
+ except Exception as e:
130
+ raise IOError(f"Failed to list files in '{data_uri}': {e}")
@@ -0,0 +1,76 @@
1
+ from contextlib import contextmanager
2
+ import io
3
+ from abc import ABC, abstractmethod
4
+ from typing import Protocol
5
+
6
+ from pyarrow.fs import FileSystem, FileType
7
+
8
+
9
class InputStream(Protocol):
    """A protocol with a subset of IOBase for file-like input objects."""

    @abstractmethod
    def read(self, size: int = -1) -> bytes:
        ...

    @abstractmethod
    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        ...

    @abstractmethod
    def tell(self) -> int:
        ...

    @abstractmethod
    def close(self) -> None:
        ...

    # Marked abstract for consistency with the other members and with
    # OutputStream.__enter__, which carries the decorator.
    @abstractmethod
    def __enter__(self) -> "InputStream":
        ...

    @abstractmethod
    def __exit__(self, exc_type, exc_value, traceback) -> None:
        ...
34
+
35
+
36
class InputFile(ABC):
    """Abstraction for interacting with input files."""

    def __init__(self, location: str):
        self._location = location

    @property
    def location(self) -> str:
        # URI or path string identifying this file.
        return self._location

    @abstractmethod
    def exists(self) -> bool:
        """Return whether a file exists at this location.

        Raises:
            PermissionError: If permissions are insufficient to access the file at location.
        """

    @abstractmethod
    def open(self) -> InputStream:
        """Return a file-like object opened for reading.

        Raises:
            FileNotFoundError: If no file exists at self.location.
            PermissionError: If permissions are insufficient to access the file at location.
        """
62
+
63
+
64
class FSInputFile(InputFile):
    """InputFile implementation backed by a ``pyarrow.fs.FileSystem``."""

    def __init__(self, location: str, fs: FileSystem):
        # Delegate to the base class so `location` handling stays consistent
        # with every other InputFile implementation.
        super().__init__(location)
        self.fs = fs

    def exists(self) -> bool:
        """Return whether an entry exists at this location."""
        file_info = self.fs.get_file_info(self._location)
        return file_info.type != FileType.NotFound

    @contextmanager
    def open(self, seekable: bool = False):
        """Open the file for reading, yielding a pyarrow input stream.

        param: seekable (bool): Currently unused; open_input_file always
            yields a seekable stream. Kept for interface compatibility.
        """
        with self.fs.open_input_file(self._location) as input_stream:
            yield input_stream
@@ -0,0 +1,86 @@
1
+ from abc import ABC, abstractmethod
2
+ from contextlib import contextmanager
3
+ import posixpath
4
+ from typing import Protocol
5
+
6
+ from pyarrow.fs import FileSystem, FileType
7
+
8
+ from deltacat.storage.rivulet.fs.input_file import FSInputFile, InputFile
9
+
10
+
11
class OutputStream(Protocol):  # pragma: no cover
    """A protocol with a subset of IOBase for file-like output objects."""

    @abstractmethod
    def write(self, b: bytes) -> int:
        ...

    @abstractmethod
    def close(self) -> None:
        ...

    @abstractmethod
    def __enter__(self) -> "OutputStream":
        ...

    @abstractmethod
    def __exit__(self, exc_type, exc_value, traceback) -> None:
        ...
29
+
30
+
31
class OutputFile(ABC):
    """Abstraction for interacting with output files."""

    def __init__(self, location: str):
        self._location = location

    @property
    def location(self) -> str:
        # URI or path string identifying this file.
        return self._location

    @abstractmethod
    def exists(self) -> bool:
        """Return whether a file exists at this location.

        Raises:
            PermissionError: If permissions are insufficient to access the file at location.
        """

    @abstractmethod
    def to_input_file(self) -> InputFile:
        """Return an InputFile for this output file's location"""

    @abstractmethod
    def create(self) -> OutputStream:
        """Return a file-like object opened for output.

        TODO: overwrite protection (FileExistsError?)
        Raises:
            PermissionError: If permissions are insufficient to access the file at location.
        """
61
+
62
+
63
class FSOutputFile(OutputFile):
    """OutputFile implementation backed by a ``pyarrow.fs.FileSystem``."""

    def __init__(self, location: str, fs: FileSystem):
        # Delegate to the base class so `location` handling stays consistent
        # with every other OutputFile implementation.
        super().__init__(location)
        self.fs = fs

    def exists(self) -> bool:
        """Return whether an entry exists at this location."""
        file_info = self.fs.get_file_info(self._location)
        return file_info.type != FileType.NotFound

    def to_input_file(self) -> "FSInputFile":
        """Return an FSInputFile for reading back this file's location."""
        return FSInputFile(self._location, self.fs)

    @contextmanager
    def create(self):
        """Create and open the file for writing."""
        try:
            # Ensure the parent directory exists before opening the stream.
            parent_dir = posixpath.dirname(self._location)
            if parent_dir:  # Check if there's a parent directory to create
                self.fs.create_dir(parent_dir, recursive=True)

            with self.fs.open_output_stream(self._location) as output_stream:
                yield output_stream
        except Exception as e:
            # Chain the original exception (PEP 3134) so the root cause is
            # preserved in the traceback instead of being flattened to a string.
            raise IOError(
                f"Failed to create or write to file '{self._location}': {e}"
            ) from e