deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff shows the changes between package versions that have been publicly released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/dataset.py (new file, +744 lines)
@@ -0,0 +1,744 @@
+from __future__ import annotations
+
+import logging
+import itertools
+import posixpath
+from typing import Dict, List, Optional, Tuple, Iterable, Iterator
+
+import pyarrow.fs
+import pyarrow as pa
+import pyarrow.dataset
+import pyarrow.json
+import pyarrow.csv
+import pyarrow.parquet
+
+from deltacat.constants import (
+    DEFAULT_NAMESPACE,
+    DEFAULT_PARTITION_ID,
+    DEFAULT_PARTITION_VALUES,
+    DEFAULT_STREAM_ID,
+    DEFAULT_TABLE_VERSION,
+)
+from deltacat.storage.model.partition import Partition, PartitionLocator
+from deltacat.storage.model.shard import Shard, ShardingStrategy
+from deltacat.storage.model.stream import Stream, StreamLocator
+from deltacat.storage.model.transaction import TransactionOperationList
+from deltacat.storage.model.types import CommitState, StreamFormat
+from deltacat.storage.rivulet.fs.file_store import FileStore
+from deltacat.storage.rivulet.fs.file_provider import FileProvider
+from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
+from deltacat.storage.rivulet import Schema, Field
+from deltacat.utils.export import export_dataset
+from .schema.schema import Datatype
+
+from deltacat.storage.rivulet.reader.data_scan import DataScan
+from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+
+from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
+from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
+    MemtableDatasetWriter,
+)
+
+from deltacat.storage import (
+    Namespace,
+    NamespaceLocator,
+    Table,
+    TableLocator,
+    TableVersion,
+    TableVersionLocator,
+    Transaction,
+    TransactionType,
+    TransactionOperation,
+    TransactionOperationType,
+)
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+# These are the hardcoded default schema names
+ALL = "all"
+DEFAULT = "default"
+
+
+class FieldsAccessor:
+    """Accessor class used to make it easy to do actions like dataset.fields['name'] to work with fields in the Dataset.
+    All field mutation and access should come through this class, or through the public helper functions in the dataset
+    class, e.g. 'add_fields()'.
+    """
+
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
+
+    def __getitem__(self, field_name: str) -> Field:
+        if field_name not in self.dataset.schemas[ALL]:
+            raise KeyError(f"Field '{field_name}' not found in dataset.")
+        return self.dataset.schemas[ALL][field_name]
+
+    def __setitem__(self, field_name: str, field: Field):
+        if not isinstance(field, Field):
+            raise TypeError("Value must be a Field object")
+        self.dataset.schemas[ALL][field_name] = field
+
+    def __delitem__(self, field_name: str):
+        if field_name not in self.dataset.schemas[ALL]:
+            raise ValueError(f"Field '{field_name}' does not exist.")
+        del self.dataset.schemas[ALL][field_name]
+        for schema in self.dataset._schemas.values():
+            if field_name in schema:
+                del schema[field_name]
+
+    def __contains__(self, field_name: str) -> bool:
+        """Allows 'field_name in dataset.fields' checks."""
+        return field_name in self.dataset.schemas[ALL]
+
+    def __iter__(self):
+        return iter(self.dataset.schemas[ALL].items())
+
+    def __len__(self):
+        return len(self.dataset.schemas[ALL])
+
+    def __repr__(self):
+        return f"Fields({list(self.dataset.schemas['all'].keys())})"
+
+    def add(
+        self,
+        name: str,
+        datatype: Datatype,
+        *,
+        schema_name: str = DEFAULT,
+        is_merge_key: bool = False,
+    ):
+        """Simple helper to add a field when you don't have a Field object"""
+        self.dataset.add_fields(
+            fields=[(name, datatype)],
+            schema_name=schema_name,
+            merge_keys=[name] if is_merge_key else None,
+        )
+
+
+class SchemasAccessor:
+    """Accessor class used to make it easy to do actions like dataset.schemas['all'] to work with schemas in the Dataset.
+    All schema mutation and access should come through this class, or through the public helper functions in the dataset
+    class, e.g. 'add_fields()'.
+    """
+
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
+
+    def __getitem__(self, name: str) -> Schema:
+        if name not in self.dataset._schemas:
+            raise KeyError(f"Schema '{name}' not found.")
+        return self.dataset._schemas[name]
+
+    def __setitem__(self, schema_name: str, field_names: List[str]) -> None:
+        self.dataset._add_fields_to_schema(
+            field_names=field_names, schema_name=schema_name
+        )
+
+    def __delitem__(self, schema_name: str) -> None:
+        if schema_name not in self.dataset._schemas:
+            raise ValueError(f"Schema '{schema_name}' does not exist.")
+        if schema_name == ALL:
+            raise ValueError("Cannot remove the 'all' schema.")
+        del self.dataset._schemas[schema_name]
+
+    def __contains__(self, schema_name: str) -> bool:
+        return schema_name in self.dataset._schemas
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.dataset._schemas.keys())
+
+    def __len__(self) -> int:
+        return len(self.dataset._schemas)
+
+    def __repr__(self) -> str:
+        return f"SchemasAccessor({list(self.dataset._schemas.keys())})"
+
+
+class Dataset:
+    def __init__(
+        self,
+        *,
+        dataset_name: str,
+        metadata_uri: Optional[str] = None,
+        schema: Optional[Schema] = None,
+        schema_name: Optional[str] = None,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: Optional[str] = DEFAULT_NAMESPACE,
+    ):
+        """
+        Create an empty Dataset w/ optional schema. This method is typically only used for small datasets that are manually created.
+        Use the Dataset.from_*() methods to create a dataset from existing data.
+
+        Args:
+            dataset_name: Unique identifier for the dataset.
+            metadata_uri: The directory to store the _metadata_folder ('.riv-meta-{dataset_name}') containing dataset metadata.
+                If not provided, we'll use the local directory.
+
+        Private Attributes:
+            _metadata_folder (str):
+                The folder name where metadata for the dataset is kept. It will always be
+                '.riv-meta-{dataset_name}', and be stored under `metadata_uri`.
+            _schemas (dict[str, Schema]):
+                Maps schemas by name (e.g., "default", "analytics"). This is how fields in the dataset are grouped and accessed.
+            _file_store (FileStore):
+                The FileStore used by the Dataset class for reading and writing metadata files.
+            _file_provider (FileProvider):
+                Used to resolve file URIs within the `_file_store`.
+            _metastore (DatasetMetastore):
+                Uses the _file_store and _file_provider to manage metadata (schema, stats, file locations, manifests, etc.) for this Dataset.
+        """
+        if not dataset_name or not isinstance(dataset_name, str):
+            raise ValueError("Name must be a non-empty string")
+
+        self.dataset_name = dataset_name
+        self._schemas: Dict[str, Schema] = {ALL: Schema()}
+
+        self._metadata_folder = f".riv-meta-{dataset_name}"
+        path, filesystem = FileStore.filesystem(
+            metadata_uri or self._metadata_folder, filesystem
+        )
+        self._metadata_path = posixpath.join(path, self._metadata_folder)
+
+        self._table_name = dataset_name
+        self._table_version = DEFAULT_TABLE_VERSION
+        self._namespace = namespace
+        self._partition_id = DEFAULT_PARTITION_ID
+
+        self._create_metadata_directories()
+
+        # TODO: remove locator state here. The deltacat catalog and
+        # storage interface should remove the need to pass around locator state
+        self._locator = PartitionLocator.at(
+            namespace=self._namespace,
+            table_name=self.dataset_name,
+            table_version=self._table_version,
+            stream_id=DEFAULT_STREAM_ID,
+            stream_format=StreamFormat.DELTACAT,
+            partition_values=DEFAULT_PARTITION_VALUES,
+            partition_id=self._partition_id,
+        )
+
+        self._file_store = FileStore(self._metadata_path, filesystem)
+        self._file_provider = FileProvider(
+            self._metadata_path, self._locator, self._file_store
+        )
+
+        self._metastore = DatasetMetastore(
+            self._metadata_path, self._file_provider, self._locator
+        )
+
+        self.fields = FieldsAccessor(self)
+        self.schemas = SchemasAccessor(self)
+
+        if schema:
+            self.add_schema(schema, schema_name=schema_name)
+
+    def _create_metadata_directories(self) -> List[str]:
+        """
+        Creates rivulet metadata files using deltacat transactions.
+        This is a temporary solution until deltacat storage is integrated.
+
+        {CATALOG_ROOT}/
+        ├── {NAMESPACE_ID}/
+        │   ├── {TABLE_ID}/
+        │   │   ├── {TABLE_VERSION}/
+        │   │   │   ├── {STREAM}/
+        │   │   │   │   ├── {PARTITION}/
+        │   │   │   │   │   ├── {DELTA}/
+        │   │   │   │   │   │   ├── rev/
+        │   │   │   │   │   │   │   ├── 00000000000000000001_create_<txn_id>.mpk  # Delta Metafile
+        │   │   │   │   │   └── ...
+
+        Currently, we assume **fixed** values for:
+        - Table Version → "table_version"
+        - Stream → "stream"
+        - Partition → "partition"
+
+        TODO this will be replaced with Deltacat Storage interface - https://github.com/ray-project/deltacat/issues/477
+        TODO: Consider how to support **dynamic values** for these entities.
+        """
+        metafiles = [
+            Namespace.of(locator=NamespaceLocator.of(namespace=self._namespace)),
+            Table.of(
+                locator=TableLocator.at(self._namespace, self.dataset_name),
+                description=f"Table for {self.dataset_name}",
+            ),
+            TableVersion.of(
+                locator=TableVersionLocator.at(
+                    self._namespace, self.dataset_name, self._table_version
+                ),
+                schema=None,
+            ),
+            Stream.of(
+                locator=StreamLocator.at(
+                    namespace=self._namespace,
+                    table_name=self.dataset_name,
+                    table_version=self._table_version,
+                    stream_id=DEFAULT_STREAM_ID,
+                    stream_format=StreamFormat.DELTACAT,
+                ),
+                partition_scheme=None,
+                state=CommitState.STAGED,
+                previous_stream_id=None,
+                watermark=None,
+            ),
+            Partition.of(
+                locator=PartitionLocator.at(
+                    namespace=self._namespace,
+                    table_name=self.dataset_name,
+                    table_version=self._table_version,
+                    stream_id=DEFAULT_STREAM_ID,
+                    stream_format=StreamFormat.DELTACAT,
+                    partition_values=DEFAULT_PARTITION_VALUES,
+                    partition_id=self._partition_id,
+                ),
+                schema=None,
+                content_types=None,
+            ),
+        ]
+
+        txn_operations = [
+            TransactionOperation.of(
+                operation_type=TransactionOperationType.CREATE, dest_metafile=meta
+            )
+            for meta in metafiles
+        ]
+
+        transaction = Transaction.of(
+            txn_type=TransactionType.APPEND,
+            txn_operations=TransactionOperationList.of(txn_operations),
+        )
+
+        try:
+            paths = transaction.commit(self._metadata_path)[0]
+            return paths
+        except Exception as e:
+            # TODO: Have deltacat storage interface handle transaction errors.
+            error_message = str(e).lower()
+            if "already exists" in error_message:
+                logger.debug(f"Skipping creation: {e}")
+                return []
+            else:
+                raise
+
+    @classmethod
+    def from_parquet(
+        cls,
+        name: str,
+        file_uri: str,
+        merge_keys: str | Iterable[str],
+        metadata_uri: Optional[str] = None,
+        schema_mode: str = "union",
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: str = DEFAULT_NAMESPACE,
+    ) -> Dataset:
+        """
+        Create a Dataset from parquet files.
+
+        TODO: Make pluggable(from_x) with other file formats.
+
+        Args:
+            name: Unique identifier for the dataset.
+            metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not specified, will be placed in ${file_uri}/riv-meta
+            file_uri: Path to parquet file(s)
+            merge_keys: Fields to specify as merge keys for future 'zipper merge' operations on the dataset
+            schema_mode: Schema combination mode. Options:
+                - 'union': Use unified schema with all columns
+                - 'intersect': Use only common columns across files
+
+        Returns:
+            Dataset: New dataset instance with the schema automatically inferred from the source parquet files
+        """
+        # TODO: integrate this with filesystem from deltacat catalog
+        file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+        if metadata_uri is None:
+            metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+        else:
+            metadata_uri, metadata_fs = FileStore.filesystem(
+                metadata_uri, filesystem=filesystem
+            )
+
+        # TODO: when integrating deltacat consider if we can support multiple filesystems
+        if file_fs.type_name != metadata_fs.type_name:
+            raise ValueError(
+                "File URI and metadata URI must be on the same filesystem."
+            )
+        pyarrow_dataset = pyarrow.dataset.dataset(file_uri, filesystem=file_fs)
+
+        if schema_mode == "intersect":
+            schemas = []
+            for file in pyarrow_dataset.files:
+                with file_fs.open_input_file(file) as f:
+                    schema = pyarrow.parquet.read_schema(f)
+                    schemas.append(schema)
+
+            common_columns = set(schemas[0].names)
+            for schema in schemas[1:]:
+                common_columns.intersection_update(schema.names)
+
+            intersect_schema = pa.schema(
+                [(name, schemas[0].field(name).type) for name in common_columns]
+            )
+            pyarrow_schema = intersect_schema
+        else:
+            schemas = []
+            for file in pyarrow_dataset.files:
+                with file_fs.open_input_file(file) as f:
+                    schema = pyarrow.parquet.read_schema(f)
+                    schemas.append(schema)
+            pyarrow_schema = pa.unify_schemas(schemas)
+
+        dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+        # TODO the file URI never gets stored/saved, do we need to do so?
+        dataset = cls(
+            dataset_name=name,
+            metadata_uri=metadata_uri,
+            schema=dataset_schema,
+            filesystem=file_fs,
+            namespace=namespace,
+        )
+
+        # TODO: avoid write! associate fields with their source data.
+        writer = dataset.writer()
+
+        for batch in pyarrow_dataset.scanner().to_batches():
+            writer.write(batch)
+        writer.flush()
+
+        return dataset
+
+    @classmethod
+    def from_json(
+        cls,
+        name: str,
+        file_uri: str,
+        merge_keys: str | Iterable[str],
+        metadata_uri: Optional[str] = None,
+        schema_mode: str = "union",
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: str = DEFAULT_NAMESPACE,
+    ) -> "Dataset":
+        """
+        Create a Dataset from a single JSON file.
+
+        TODO: Add support for reading directories with multiple JSON files.
+
+        Args:
+            name: Unique identifier for the dataset.
+            metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not specified, will be placed in ${file_uri}/riv-meta
+            file_uri: Path to a single JSON file.
+            merge_keys: Fields to specify as merge keys for future 'zipper merge' operations on the dataset.
+            schema_mode: Currently ignored as this is for a single file.
+
+        Returns:
+            Dataset: New dataset instance with the schema automatically inferred
+            from the JSON file.
+        """
+        # TODO: integrate this with filesystem from deltacat catalog
+        file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+        if metadata_uri is None:
+            metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+        else:
+            metadata_uri, metadata_fs = FileStore.filesystem(
+                metadata_uri, filesystem=filesystem
+            )
+
+        # TODO: when integrating deltacat consider if we can support multiple filesystems
+        if file_fs.type_name != metadata_fs.type_name:
+            raise ValueError(
+                "File URI and metadata URI must be on the same filesystem."
+            )
+
+        # Read the JSON file into a PyArrow Table
+        pyarrow_table = pyarrow.json.read_json(file_uri, filesystem=file_fs)
+        pyarrow_schema = pyarrow_table.schema
+
+        # Create the dataset schema
+        dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+        # Create the Dataset instance
+        dataset = cls(
+            dataset_name=name,
+            metadata_uri=metadata_uri,
+            schema=dataset_schema,
+            filesystem=file_fs,
+            namespace=namespace,
+        )
+
+        writer = dataset.writer()
+        writer.write(pyarrow_table.to_batches())
+        writer.flush()
+
+        return dataset
+
+    @classmethod
+    def from_csv(
+        cls,
+        name: str,
+        file_uri: str,
+        merge_keys: str | Iterable[str],
+        metadata_uri: Optional[str] = None,
+        schema_mode: str = "union",
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        namespace: str = DEFAULT_NAMESPACE,
+    ) -> "Dataset":
+        """
+        Create a Dataset from a single CSV file.
+
+        TODO: Add support for reading directories with multiple CSV files.
+
+        Args:
+            name: Unique identifier for the dataset.
+            metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not specified, will be placed in ${file_uri}/riv-meta
+            file_uri: Path to a single CSV file.
+            merge_keys: Fields to specify as merge keys for future 'zipper merge' operations on the dataset.
+            schema_mode: Currently ignored as this is for a single file.
+
+        Returns:
+            Dataset: New dataset instance with the schema automatically inferred
+            from the CSV file.
+        """
+        # TODO: integrate this with filesystem from deltacat catalog
+        file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+        if metadata_uri is None:
+            metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+        else:
+            metadata_uri, metadata_fs = FileStore.filesystem(
+                metadata_uri, filesystem=filesystem
+            )
+
+        # TODO: when integrating deltacat consider if we can support multiple filesystems
+        if file_fs.type_name != metadata_fs.type_name:
+            raise ValueError(
+                "File URI and metadata URI must be on the same filesystem."
+            )
+
+        # Read the CSV file into a PyArrow Table
+        table = pyarrow.csv.read_csv(file_uri, filesystem=file_fs)
+        pyarrow_schema = table.schema
+
+        # Create the dataset schema
+        dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+        # Create the Dataset instance
+        dataset = cls(
+            dataset_name=name,
+            metadata_uri=metadata_uri,
+            schema=dataset_schema,
+            filesystem=file_fs,
+            namespace=namespace,
+        )
+
+        writer = dataset.writer()
+        writer.write(table.to_batches())
+        writer.flush()
+
+        return dataset
+
+    def print(self, num_records: int = 10) -> None:
+        """Prints the first `num_records` records in the dataset."""
+        records = self.scan().to_pydict()
+        for record in itertools.islice(records, num_records):
+            print(record)
+
+    def export(
+        self,
+        file_uri: str,
+        format: str = "parquet",
+        query: QueryExpression = QueryExpression(),
+    ) -> None:
+        """Export the dataset to a file.
+
+        Args:
+            file_uri: The URI to write the dataset to.
+            format: The format to write the dataset in. Options are [parquet, feather].
+        """
+        export_dataset(self, file_uri, format, query)
+
+    def _add_fields_to_schema(
+        self,
+        field_names: Iterable[str],
+        schema_name: str,
+    ) -> None:
+        """
+        An internal function to add fields to a new or existing schema (creating the schema if it doesn't exist).
+        Note: This function will error if the fields do not exist (rather than add them).
+
+        Args:
+            field_names: List of field names to add to the schema.
+            schema_name: Name of the schema.
+
+        Raises:
+            ValueError: If any field does not exist in the dataset.
+        """
+
+        # Input Validation
+        # Ensure all fields exist
+        for name in field_names:
+            if name not in self.schemas[ALL]:
+                raise ValueError(f"Field '{name}' does not exist in the dataset.")
+
+        # Begin adding schema/fields to the schema map, this must be completed as a transaction w/o error or the schemas will be
+        # left in an undefined state.
+        # TODO: This is not threadsafe
+
+        # Create the empty schema if it doesn't exist
+        if schema_name not in self._schemas:
+            self._schemas[schema_name] = Schema()
+
+        # Add the (existing) fields from the 'all' schema to the defined schema
+        for name in field_names:
+            self._schemas[schema_name].add_field(self.schemas[ALL][name])
+
+    def add_fields(
+        self,
+        fields: Iterable[Tuple[str, Datatype] | Field],
+        schema_name: str = DEFAULT,
+        merge_keys: Optional[Iterable[str]] = None,
+    ) -> None:
+        """
+        Helper function to simultaneously add a set of new fields, put them under a new or existing schema,
+        and add merge keys, all in a single function.
+
+        This can also be done field by field using:
+          * dataset.fields.add(name=.., datatype=.., ...)
+
+        Or it can be done by using add_schema().
+
+        Args:
+            fields: List of tuples (name, datatype) or Field objects.
+            schema_name: User defined name to give to the group of fields.
+            merge_keys: Optional list of field names to set as merge keys.
+
+        Raises:
+            ValueError: If any field has the same name as an existing field.
+        """
+        if not fields:
+            raise ValueError("No fields provided.")
+        merge_keys = merge_keys or {}
+
+        # Convert all input tuples to Field objects
+        processed_fields = []
+        field_names = set()
+
+        for field in fields:
+            if isinstance(field, tuple):
+                name, datatype = field
+                processed_field = Field(
+                    name=name, datatype=datatype, is_merge_key=(name in merge_keys)
+                )
+            elif isinstance(field, Field):
+                processed_field = field
+                name = field.name
+                # Check if merge key status on the field conflicts with any status provided via the merge_keys list
+                if name in merge_keys:
+                    if processed_field.is_merge_key is not True:
+                        raise TypeError(
+                            f"Merge key status conflict for field '{name}'. "
+                            f"Field({name}).is_merge_key is set to 'false', but '{name}' was provided in the merge_keys list. "
+                            f"Remove {name} from merge_keys or change Field({name}).is_merge_key to true."
+                        )
+            else:
+                raise TypeError(f"Unexpected field type: {type(field)}")
+
+            processed_fields.append(processed_field)
+            field_names.add(name)
+
+        # Input Validation
+        # Check that merge_keys defined are present in the fields being added
+        if merge_keys:
+            missing_keys = set(merge_keys) - field_names
+            if missing_keys:
+                raise ValueError(
+                    f"The following merge keys were not found in the provided fields: {', '.join(missing_keys)}"
+                )
+
+        # Add/update the schema
+        self.add_schema(Schema(processed_fields), schema_name=schema_name)
+
+    def add_schema(self, schema: Schema, schema_name: str = DEFAULT) -> None:
+        """
+        Merges the provided schema into the existing schema, or creates a new schema if it doesn't exist.
+        Will also add all fields to the 'all' schema.
+
+        Args:
+            schema: The Schema to add or merge into the named dataset schema.
+            schema_name: The name of the schema to update or create. Defaults to "default".
+
+        Raises:
+            ValueError: If fields in the provided schema conflict with existing fields in the dataset.
+        """
+        schema_name = schema_name or DEFAULT
+
+        # Check for any fields that already exist
+        for field in schema.values():
+            if field.name in self.schemas[ALL]:
+                existing_field = self.schemas[ALL][field.name]
+                if existing_field is not None and field != existing_field:
+                    raise ValueError(
+                        f"Field '{field.name}' already exists and is of a different type: New({field}) Existing({existing_field})."
+                    )
+
+        # Begin adding fields, this must be completed as a transaction w/o error or the field maps will be
+        # left in an undefined state.
+        # TODO: This is not threadsafe
+
+        # Create schema if it doesn't exist
+        if schema_name not in self._schemas:
+            self._schemas[schema_name] = Schema()
+
+        # Merge new schema into 'all' and provided schema_name
+        self._schemas[schema_name].merge(schema)
+        self._schemas[ALL].merge(schema)
+
+    def get_merge_keys(self) -> Iterable[str]:
+        """Return a list of all merge keys."""
+        return self.schemas[ALL].get_merge_keys()
+
+    def writer(
+        self,
+        schema_name: str = None,
+        file_format: str | None = None,
+    ) -> DatasetWriter:
+        """Create a new (stateful) writer using the schema at the conjunction of given schemas.
+
+        Invoking this will register any unregistered schemas.
+
+        :param schema_name: The schema to use for write, if None, uses the 'all' schema
+        :param file_format: Write data to this format. Options are [parquet, feather]. If not specified, library will choose
+            based on schema
+        :return: new dataset writer with a schema at the conjunction of the given schemas
+        """
+        schema_name = schema_name or ALL
+
+        return MemtableDatasetWriter(
+            self._file_provider, self.schemas[schema_name], self._locator, file_format
+        )
+
+    def shards(
+        self,
+        num_shards: int,
+        strategy: str = "range",
+    ) -> Iterable[Shard]:
+        """Create a set of shards for this dataset.
+
+        :param num_shards: The number of shards to create.
+        :param strategy: Sharding strategy used to create shards.
+        :return Iterable[Shard]: A set of shards for this dataset.
+        """
+        return ShardingStrategy.from_string(strategy).shards(
+            num_shards, self._metastore
+        )
+
+    def scan(
+        self,
+        query: QueryExpression = QueryExpression(),
+        schema_name: str = ALL,
+        shard: Optional[Shard] = None,
+    ) -> DataScan:
+        dataset_reader = DatasetReader(self._metastore)
+        return DataScan(self.schemas[schema_name], query, dataset_reader, shard=shard)