deltacat-1.1.36-py3-none-any.whl → deltacat-2.0.0b2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/dataset.py (new file)
@@ -0,0 +1,744 @@
+from __future__ import annotations
+
+import logging
+import itertools
+import posixpath
+from typing import Dict, List, Optional, Tuple, Iterable, Iterator
+
+import pyarrow.fs
+import pyarrow as pa
+import pyarrow.dataset
+import pyarrow.json
+import pyarrow.csv
+import pyarrow.parquet
+
+from deltacat.constants import (
+    DEFAULT_NAMESPACE,
+    DEFAULT_PARTITION_ID,
+    DEFAULT_PARTITION_VALUES,
+    DEFAULT_STREAM_ID,
+    DEFAULT_TABLE_VERSION,
+)
+from deltacat.storage.model.partition import Partition, PartitionLocator
+from deltacat.storage.model.shard import Shard, ShardingStrategy
+from deltacat.storage.model.stream import Stream, StreamLocator
+from deltacat.storage.model.transaction import TransactionOperationList
+from deltacat.storage.model.types import CommitState, StreamFormat
+from deltacat.storage.rivulet.fs.file_store import FileStore
+from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
+from deltacat.storage.rivulet import Schema, Field
+from deltacat.utils.export import export_dataset
+from .schema.schema import Datatype
+
+from deltacat.storage.rivulet.reader.data_scan import DataScan
+from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+
+from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
+from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
+    MemtableDatasetWriter,
+)
+
+from deltacat.storage import (
+    Namespace,
+    NamespaceLocator,
+    Table,
+    TableLocator,
+    TableVersion,
+    TableVersionLocator,
+    Transaction,
+    TransactionType,
+    TransactionOperation,
+    TransactionOperationType,
+)
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+# These are the hardcoded default schema names
+ALL = "all"
+DEFAULT = "default"
+
+
+class FieldsAccessor:
+    """Accessor class used to make it easy to do actions like dataset.fields['name'] to work with fields in the Dataset.
+    All field mutation and access should come through this class, or through the public helper functions in the dataset
+    class, e.g. 'add_fields()'.
+    """
+
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
+
+    def __getitem__(self, field_name: str) -> Field:
+        if field_name not in self.dataset.schemas[ALL]:
+            raise KeyError(f"Field '{field_name}' not found in dataset.")
+        return self.dataset.schemas[ALL][field_name]
+
+    def __setitem__(self, field_name: str, field: Field):
+        if not isinstance(field, Field):
+            raise TypeError("Value must be a Field object")
+        self.dataset.schemas[ALL][field_name] = field
+
+    def __delitem__(self, field_name: str):
+        if field_name not in self.dataset.schemas[ALL]:
+            raise ValueError(f"Field '{field_name}' does not exist.")
+        del self.dataset.schemas[ALL][field_name]
+        for schema in self.dataset._schemas.values():
+            if field_name in schema:
+                del schema[field_name]
+
+    def __contains__(self, field_name: str) -> bool:
+        """Allows 'field_name in dataset.fields' checks."""
+        return field_name in self.dataset.schemas[ALL]
+
+    def __iter__(self):
+        return iter(self.dataset.schemas[ALL].items())
+
+    def __len__(self):
+        return len(self.dataset.schemas[ALL])
+
+    def __repr__(self):
+        return f"Fields({list(self.dataset.schemas['all'].keys())})"
+
+    def add(
+        self,
+        name: str,
+        datatype: Datatype,
+        *,
+        schema_name: str = DEFAULT,
+        is_merge_key: bool = False,
+    ):
+        """Simple helper to add a field when you don't have a Field object"""
+        self.dataset.add_fields(
+            fields=[(name, datatype)],
+            schema_name=schema_name,
+            merge_keys=[name] if is_merge_key else None,
+        )
+
+
+class SchemasAccessor:
+    """Accessor class used to make it easy to do actions like dataset.schemas['all'] to work with schemas in the Dataset.
+    All schema mutation and access should come through this class, or through the public helper functions in the dataset
+    class, e.g. 'add_fields()'.
+    """
+
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
+
+    def __getitem__(self, name: str) -> Schema:
+        if name not in self.dataset._schemas:
+            raise KeyError(f"Schema '{name}' not found.")
+        return self.dataset._schemas[name]
+
+    def __setitem__(self, schema_name: str, field_names: List[str]) -> None:
+        self.dataset._add_fields_to_schema(
+            field_names=field_names, schema_name=schema_name
+        )
+
+    def __delitem__(self, schema_name: str) -> None:
+        if schema_name not in self.dataset._schemas:
+            raise ValueError(f"Schema '{schema_name}' does not exist.")
+        if schema_name == ALL:
+            raise ValueError("Cannot remove the 'all' schema.")
+        del self.dataset._schemas[schema_name]
+
+    def __contains__(self, schema_name: str) -> bool:
+        return schema_name in self.dataset._schemas
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.dataset._schemas.keys())
+
+    def __len__(self) -> int:
+        return len(self.dataset._schemas)
+
+    def __repr__(self) -> str:
+        return f"SchemasAccessor({list(self.dataset._schemas.keys())})"
+
+
+class Dataset:
+    def __init__(
+        self,
+        *,
+        dataset_name: str,
+        metadata_uri: Optional[str] = None,
+        schema: Optional[Schema] = None,
+        schema_name: Optional[str] = None,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: Optional[str] = DEFAULT_NAMESPACE,
+    ):
+        """
+        Create an empty Dataset w/ optional schema. This method is typically only used for small datasets that are manually created.
+        Use the Dataset.from_*() methods to create a dataset from existing data.
+
+        Args:
+            dataset_name: Unique identifier for the dataset.
+            metadata_uri: The directory to store the _metadata_folder ('.riv-meta-{dataset_name}') containing dataset metadata.
+                If not provided, we'll use the local directory.
+
+        Private Attributes:
+            _metadata_folder (str):
+                The folder name where metadata for the dataset is kept. It will always be
+                '.riv-meta-{dataset_name}', and be stored under `metadata_uri`.
+            _schemas (dict[str, Schema]):
+                Maps schemas by name (e.g., "default", "analytics"). This is how fields in the dataset are grouped and accessed.
+            _file_store (FileStore):
+                The FileStore used by the Dataset class for reading and writing metadata files.
+            _file_provider (FileProvider):
+                Used to resolve file URIs within the `_file_store`.
+            _metastore (DatasetMetastore):
+                Uses the _file_store and _file_provider to manage metadata (schema, stats, file locations, manifests, etc.) for this Dataset.
+        """
+        if not dataset_name or not isinstance(dataset_name, str):
+            raise ValueError("Name must be a non-empty string")
+
+        self.dataset_name = dataset_name
+        self._schemas: Dict[str, Schema] = {ALL: Schema()}
+
+        self._metadata_folder = f".riv-meta-{dataset_name}"
+        path, filesystem = FileStore.filesystem(
+            metadata_uri or self._metadata_folder, filesystem
+        )
+        self._metadata_path = posixpath.join(path, self._metadata_folder)
+
+        self._table_name = dataset_name
+        self._table_version = DEFAULT_TABLE_VERSION
+        self._namespace = namespace
+        self._partition_id = DEFAULT_PARTITION_ID
+
+        self._create_metadata_directories()
+
+        # TODO: remove locator state here. The deltacat catalog and
+        # storage interface should remove the need to pass around locator state
+        self._locator = PartitionLocator.at(
+            namespace=self._namespace,
+            table_name=self.dataset_name,
+            table_version=self._table_version,
+            stream_id=DEFAULT_STREAM_ID,
+            stream_format=StreamFormat.DELTACAT,
+            partition_values=DEFAULT_PARTITION_VALUES,
+            partition_id=self._partition_id,
+        )
+
+        self._file_store = FileStore(self._metadata_path, filesystem)
+        self._file_provider = FileProvider(
+            self._metadata_path, self._locator, self._file_store
+        )
+
+        self._metastore = DatasetMetastore(
+            self._metadata_path, self._file_provider, self._locator
+        )
+
+        self.fields = FieldsAccessor(self)
+        self.schemas = SchemasAccessor(self)
+
+        if schema:
+            self.add_schema(schema, schema_name=schema_name)
+
+    def _create_metadata_directories(self) -> List[str]:
+        """
+        Creates rivulet metadata files using deltacat transactions.
+        This is a temporary solution until deltacat storage is integrated.
+
+        {CATALOG_ROOT}/
+        ├── {NAMESPACE_ID}/
+        │   ├── {TABLE_ID}/
+        │   │   ├── {TABLE_VERSION}/
+        │   │   │   ├── {STREAM}/
+        │   │   │   │   ├── {PARTITION}/
+        │   │   │   │   │   ├── {DELTA}/
+        │   │   │   │   │   │   ├── rev/
+        │   │   │   │   │   │   │   ├── 00000000000000000001_create_<txn_id>.mpk  # Delta Metafile
+        │   │   │   │   │   └── ...
+
+        Currently, we assume **fixed** values for:
+        - Table Version → "table_version"
+        - Stream → "stream"
+        - Partition → "partition"
+
+        TODO this will be replaced with Deltacat Storage interface - https://github.com/ray-project/deltacat/issues/477
+        TODO: Consider how to support **dynamic values** for these entities.
+        """
+        metafiles = [
+            Namespace.of(locator=NamespaceLocator.of(namespace=self._namespace)),
+            Table.of(
+                locator=TableLocator.at(self._namespace, self.dataset_name),
+                description=f"Table for {self.dataset_name}",
+            ),
+            TableVersion.of(
+                locator=TableVersionLocator.at(
+                    self._namespace, self.dataset_name, self._table_version
+                ),
+                schema=None,
+            ),
+            Stream.of(
+                locator=StreamLocator.at(
+                    namespace=self._namespace,
+                    table_name=self.dataset_name,
+                    table_version=self._table_version,
+                    stream_id=DEFAULT_STREAM_ID,
+                    stream_format=StreamFormat.DELTACAT,
+                ),
+                partition_scheme=None,
+                state=CommitState.STAGED,
+                previous_stream_id=None,
+                watermark=None,
+            ),
+            Partition.of(
+                locator=PartitionLocator.at(
+                    namespace=self._namespace,
+                    table_name=self.dataset_name,
+                    table_version=self._table_version,
+                    stream_id=DEFAULT_STREAM_ID,
+                    stream_format=StreamFormat.DELTACAT,
+                    partition_values=DEFAULT_PARTITION_VALUES,
+                    partition_id=self._partition_id,
+                ),
+                schema=None,
+                content_types=None,
+            ),
+        ]
+
+        txn_operations = [
+            TransactionOperation.of(
+                operation_type=TransactionOperationType.CREATE, dest_metafile=meta
+            )
+            for meta in metafiles
+        ]
+
+        transaction = Transaction.of(
+            txn_type=TransactionType.APPEND,
+            txn_operations=TransactionOperationList.of(txn_operations),
+        )
+
+        try:
+            paths = transaction.commit(self._metadata_path)[0]
+            return paths
+        except Exception as e:
+            # TODO: Have deltacat storage interface handle transaction errors.
+            error_message = str(e).lower()
+            if "already exists" in error_message:
+                logger.debug(f"Skipping creation: {e}")
+                return []
+            else:
+                raise
+
+    @classmethod
+    def from_parquet(
+        cls,
+        name: str,
+        file_uri: str,
+        merge_keys: str | Iterable[str],
+        metadata_uri: Optional[str] = None,
+        schema_mode: str = "union",
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: str = DEFAULT_NAMESPACE,
+    ) -> Dataset:
+        """
+        Create a Dataset from parquet files.
+
+        TODO: Make pluggable(from_x) with other file formats.
+
+        Args:
+            name: Unique identifier for the dataset.
+            metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not specified, will be placed in ${file_uri}/riv-meta
+            file_uri: Path to parquet file(s)
+            merge_keys: Fields to specify as merge keys for future 'zipper merge' operations on the dataset
+            schema_mode: Schema combination mode. Options:
+                - 'union': Use unified schema with all columns
+                - 'intersect': Use only common columns across files
+
+        Returns:
+            Dataset: New dataset instance with the schema automatically inferred from the source parquet files
+        """
+        # TODO: integrate this with filesystem from deltacat catalog
+        file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+        if metadata_uri is None:
+            metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+        else:
+            metadata_uri, metadata_fs = FileStore.filesystem(
+                metadata_uri, filesystem=filesystem
+            )
+
+        # TODO: when integrating deltacat consider if we can support multiple filesystems
+        if file_fs.type_name != metadata_fs.type_name:
+            raise ValueError(
+                "File URI and metadata URI must be on the same filesystem."
+            )
+        pyarrow_dataset = pyarrow.dataset.dataset(file_uri, filesystem=file_fs)
+
+        if schema_mode == "intersect":
+            schemas = []
+            for file in pyarrow_dataset.files:
+                with file_fs.open_input_file(file) as f:
+                    schema = pyarrow.parquet.read_schema(f)
+                    schemas.append(schema)
+
+            common_columns = set(schemas[0].names)
+            for schema in schemas[1:]:
+                common_columns.intersection_update(schema.names)
+
+            intersect_schema = pa.schema(
+                [(name, schemas[0].field(name).type) for name in common_columns]
+            )
+            pyarrow_schema = intersect_schema
+        else:
+            schemas = []
+            for file in pyarrow_dataset.files:
+                with file_fs.open_input_file(file) as f:
+                    schema = pyarrow.parquet.read_schema(f)
+                    schemas.append(schema)
+            pyarrow_schema = pa.unify_schemas(schemas)
+
+        dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+        # TODO the file URI never gets stored/saved, do we need to do so?
+        dataset = cls(
+            dataset_name=name,
+            metadata_uri=metadata_uri,
+            schema=dataset_schema,
+            filesystem=file_fs,
+            namespace=namespace,
+        )
+
+        # TODO: avoid write! associate fields with their source data.
+        writer = dataset.writer()
+
+        for batch in pyarrow_dataset.scanner().to_batches():
+            writer.write(batch)
+        writer.flush()
+
+        return dataset
+
+    @classmethod
+    def from_json(
+        cls,
+        name: str,
+        file_uri: str,
+        merge_keys: str | Iterable[str],
+        metadata_uri: Optional[str] = None,
+        schema_mode: str = "union",
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: str = DEFAULT_NAMESPACE,
+    ) -> "Dataset":
+        """
+        Create a Dataset from a single JSON file.
+
+        TODO: Add support for reading directories with multiple JSON files.
+
+        Args:
+            name: Unique identifier for the dataset.
+            metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not specified, will be placed in ${file_uri}/riv-meta
+            file_uri: Path to a single JSON file.
+            merge_keys: Fields to specify as merge keys for future 'zipper merge' operations on the dataset.
+            schema_mode: Currently ignored as this is for a single file.
+
+        Returns:
+            Dataset: New dataset instance with the schema automatically inferred
+                from the JSON file.
+        """
+        # TODO: integrate this with filesystem from deltacat catalog
+        file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+        if metadata_uri is None:
+            metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+        else:
+            metadata_uri, metadata_fs = FileStore.filesystem(
+                metadata_uri, filesystem=filesystem
+            )

+        # TODO: when integrating deltacat consider if we can support multiple filesystems
+        if file_fs.type_name != metadata_fs.type_name:
+            raise ValueError(
+                "File URI and metadata URI must be on the same filesystem."
+            )
+
+        # Read the JSON file into a PyArrow Table
+        pyarrow_table = pyarrow.json.read_json(file_uri, filesystem=file_fs)
+        pyarrow_schema = pyarrow_table.schema
+
+        # Create the dataset schema
+        dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+        # Create the Dataset instance
+        dataset = cls(
+            dataset_name=name,
+            metadata_uri=metadata_uri,
+            schema=dataset_schema,
+            filesystem=file_fs,
+            namespace=namespace,
+        )
+
+        writer = dataset.writer()
+        writer.write(pyarrow_table.to_batches())
+        writer.flush()
+
+        return dataset
+
+    @classmethod
+    def from_csv(
+        cls,
+        name: str,
+        file_uri: str,
+        merge_keys: str | Iterable[str],
+        metadata_uri: Optional[str] = None,
+        schema_mode: str = "union",
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: str = DEFAULT_NAMESPACE,
+    ) -> "Dataset":
+        """
+        Create a Dataset from a single CSV file.
+
+        TODO: Add support for reading directories with multiple CSV files.
+
+        Args:
+            name: Unique identifier for the dataset.
+            metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not specified, will be placed in ${file_uri}/riv-meta
+            file_uri: Path to a single CSV file.
+            merge_keys: Fields to specify as merge keys for future 'zipper merge' operations on the dataset.
+            schema_mode: Currently ignored as this is for a single file.
+
+        Returns:
+            Dataset: New dataset instance with the schema automatically inferred
+                from the CSV file.
+        """
+        # TODO: integrate this with filesystem from deltacat catalog
+        file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+        if metadata_uri is None:
+            metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+        else:
+            metadata_uri, metadata_fs = FileStore.filesystem(
+                metadata_uri, filesystem=filesystem
+            )
+
+        # TODO: when integrating deltacat consider if we can support multiple filesystems
+        if file_fs.type_name != metadata_fs.type_name:
+            raise ValueError(
+                "File URI and metadata URI must be on the same filesystem."
+            )
+
+        # Read the CSV file into a PyArrow Table
+        table = pyarrow.csv.read_csv(file_uri, filesystem=file_fs)
+        pyarrow_schema = table.schema
+
+        # Create the dataset schema
+        dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+        # Create the Dataset instance
+        dataset = cls(
+            dataset_name=name,
+            metadata_uri=metadata_uri,
+            schema=dataset_schema,
+            filesystem=file_fs,
+            namespace=namespace,
+        )
+
+        writer = dataset.writer()
+        writer.write(table.to_batches())
+        writer.flush()
+
+        return dataset
+
+    def print(self, num_records: int = 10) -> None:
+        """Prints the first `num_records` records in the dataset."""
+        records = self.scan().to_pydict()
+        for record in itertools.islice(records, num_records):
+            print(record)
+
+    def export(
+        self,
+        file_uri: str,
+        format: str = "parquet",
+        query: QueryExpression = QueryExpression(),
+    ) -> None:
+        """Export the dataset to a file.
+
+        Args:
+            file_uri: The URI to write the dataset to.
+            format: The format to write the dataset in. Options are [parquet, feather].
+        """
+        export_dataset(self, file_uri, format, query)
+
+    def _add_fields_to_schema(
+        self,
+        field_names: Iterable[str],
+        schema_name: str,
+    ) -> None:
+        """
+        An internal function to add fields to a new or existing schema (creating the schema if it doesn't exist).
+        Note: This function will error if the fields do not exist (rather than add them).
+
+        Args:
+            field_names: List of field names to add to the schema.
+            schema_name: Name of the schema.
+
+        Raises:
+            ValueError: If any field does not exist in the dataset.
+        """
+
+        # Input Validation
+        # Ensure all fields exist
+        for name in field_names:
+            if name not in self.schemas[ALL]:
+                raise ValueError(f"Field '{name}' does not exist in the dataset.")
+
+        # Begin adding schema/fields to the schema map, this must be completed as a transaction w/o error or the schemas will be
+        # left in an undefined state.
+        # TODO: This is not threadsafe
+
+        # Create the empty schema if it doesn't exist
+        if schema_name not in self._schemas:
+            self._schemas[schema_name] = Schema()
+
+        # Add the (existing) fields from the 'all' schema to the defined schema
+        for name in field_names:
+            self._schemas[schema_name].add_field(self.schemas[ALL][name])
+
+    def add_fields(
+        self,
+        fields: Iterable[Tuple[str, Datatype] | Field],
+        schema_name: str = DEFAULT,
+        merge_keys: Optional[Iterable[str]] = None,
+    ) -> None:
+        """
+        Helper function to simultaneously add a set of new fields, put them under a new or existing schema,
+        and add merge keys, all in a single function.
+
+        This can also be done field by field using:
+        * dataset.fields.add(name=.., datatype=.., ...)
+
+        Or it can be done by using add_schema().
+
+        Args:
+            fields: List of tuples (name, datatype) or Field objects.
+            schema_name: User defined name to give to the group of fields.
+            merge_keys: Optional list of field names to set as merge keys.
+
+        Raises:
+            ValueError: If any field has the same name as an existing field.
+        """
+        if not fields:
+            raise ValueError("No fields provided.")
+        merge_keys = merge_keys or {}
+
+        # Convert all input tuples to Field objects
+        processed_fields = []
+        field_names = set()
+
+        for field in fields:
+            if isinstance(field, tuple):
+                name, datatype = field
+                processed_field = Field(
+                    name=name, datatype=datatype, is_merge_key=(name in merge_keys)
+                )
+            elif isinstance(field, Field):
+                processed_field = field
+                name = field.name
+                # Check if merge key status on the field conflicts with any status provided via the merge_keys list
+                if name in merge_keys:
+                    if processed_field.is_merge_key is not True:
+                        raise TypeError(
+                            f"Merge key status conflict for field '{name}'. "
+                            f"Field({name}).is_merge_key is set to 'false', but '{name}' was provided in the merge_keys list. "
+                            f"Remove {name} from merge_keys or change Field({name}).is_merge_key to true."
+                        )
+            else:
+                raise TypeError(f"Unexpected field type: {type(field)}")
+
+            processed_fields.append(processed_field)
+            field_names.add(name)
+
+        # Input Validation
+        # Check that merge_keys defined are present in the fields being added
+        if merge_keys:
+            missing_keys = set(merge_keys) - field_names
+            if missing_keys:
+                raise ValueError(
+                    f"The following merge keys were not found in the provided fields: {', '.join(missing_keys)}"
+                )
+
+        # Add/update the schema
+        self.add_schema(Schema(processed_fields), schema_name=schema_name)
+
+    def add_schema(self, schema: Schema, schema_name: str = DEFAULT) -> None:
+        """
+        Merges the provided schema into the existing schema, or creates a new schema if it doesn't exist.
+        Will also add all fields to the 'all' schema.
+
+        Args:
+            schema: The Schema to add or merge into the named dataset schema.
+            schema_name: The name of the schema to update or create. Defaults to "default".
+
+        Raises:
+            ValueError: If fields in the provided schema conflict with existing fields in the dataset.
+        """
+        schema_name = schema_name or DEFAULT
+
+        # Check for any fields that already exist
+        for field in schema.values():
+            if field.name in self.schemas[ALL]:
+                existing_field = self.schemas[ALL][field.name]
+                if existing_field is not None and field != existing_field:
+                    raise ValueError(
+                        f"Field '{field.name}' already exists and is of a different type: New({field}) Existing({existing_field})."
+                    )
+
+        # Begin adding fields, this must be completed as a transaction w/o error or the field maps will be
+        # left in an undefined state.
+        # TODO: This is not threadsafe
+
+        # Create schema if it doesn't exist
+        if schema_name not in self._schemas:
+            self._schemas[schema_name] = Schema()
+
+        # Merge new schema into 'all' and provided schema_name
+        self._schemas[schema_name].merge(schema)
+        self._schemas[ALL].merge(schema)
+
+    def get_merge_keys(self) -> Iterable[str]:
+        """Return a list of all merge keys."""
+        return self.schemas[ALL].get_merge_keys()
+
+    def writer(
+        self,
+        schema_name: str = None,
+        file_format: str | None = None,
+    ) -> DatasetWriter:
+        """Create a new (stateful) writer using the schema at the conjunction of given schemas.
+
+        Invoking this will register any unregistered schemas.
+
+        :param schema_name: The schema to use for write; if None, uses the 'all' schema
+        :param file_format: Write data to this format. Options are [parquet, feather]. If not specified, the library will choose
+            based on the schema
+        :return: new dataset writer with a schema at the conjunction of the given schemas
+        """
+        schema_name = schema_name or ALL
+
+        return MemtableDatasetWriter(
+            self._file_provider, self.schemas[schema_name], self._locator, file_format
+        )
+
+    def shards(
+        self,
+        num_shards: int,
+        strategy: str = "range",
+    ) -> Iterable[Shard]:
+        """Create a set of shards for this dataset.
+
+        :param num_shards: The number of shards to create.
+        :param strategy: Sharding strategy used to create shards.
+        :return Iterable[Shard]: A set of shards for this dataset.
+        """
+        return ShardingStrategy.from_string(strategy).shards(
+            num_shards, self._metastore
+        )
+
+    def scan(
+        self,
+        query: QueryExpression = QueryExpression(),
+        schema_name: str = ALL,
+        shard: Optional[Shard] = None,
+    ) -> DataScan:
+        dataset_reader = DatasetReader(self._metastore)
+        return DataScan(self.schemas[schema_name], query, dataset_reader, shard=shard)
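
For orientation, here is a minimal usage sketch of the new rivulet Dataset API shown in the diff above. It only uses constructors and methods visible in deltacat/storage/rivulet/dataset.py (Dataset.from_parquet, scan, export); the file paths and the "order_id" field name are hypothetical, and the snippet has not been validated against the released 2.0.0b2 wheel.

from deltacat.storage.rivulet.dataset import Dataset
from deltacat.storage.rivulet.reader.query_expression import QueryExpression

# Build a dataset from existing parquet data; the schema is inferred from the
# source files and "order_id" is registered as a merge key (paths are hypothetical).
ds = Dataset.from_parquet(
    name="orders",
    file_uri="/tmp/orders/",      # parquet file(s) to ingest
    merge_keys="order_id",
    schema_mode="union",          # or "intersect" to keep only columns common to all files
)

# Read records back through the metastore-backed reader.
for record in ds.scan(QueryExpression()).to_pydict():
    print(record)

# Export a copy of the dataset; Dataset.export lists parquet and feather as options.
ds.export("/tmp/orders_export.parquet", format="parquet")

Fields can also be declared directly with dataset.add_fields([...]) or dataset.fields.add(...), flagging merge keys per field; the concrete Datatype constructors live in deltacat/storage/rivulet/schema/datatype.py and are not shown in this diff.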