deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
```diff
@@ -1,6 +1,12 @@
+import importlib
 import logging
 
 import deltacat.logs  # noqa: F401
+from deltacat.api import (
+    copy,
+    get,
+    put,
+)
 from deltacat.catalog.delegate import (
     alter_namespace,
     alter_table,
@@ -24,32 +30,51 @@ from deltacat.catalog.delegate import (
 from deltacat.catalog.model.catalog import (  # noqa: F401
     Catalog,
     Catalogs,
-    all_catalogs,
+    is_initialized,
     init,
+    get_catalog,
+    put_catalog,
 )
 from deltacat.catalog.model.table_definition import TableDefinition
 from deltacat.storage import (
     DistributedDataset,
+    Field,
     LifecycleState,
     ListResult,
     LocalDataset,
     LocalTable,
     Namespace,
+    PartitionKey,
+    PartitionScheme,
+    Schema,
     SchemaConsistencyType,
     SortKey,
     SortOrder,
+    SortScheme,
+    NullOrder,
 )
+from deltacat.storage.rivulet import Dataset, Datatype
 from deltacat.types.media import ContentEncoding, ContentType, TableType
 from deltacat.types.tables import TableWriteMode
 
+__iceberg__ = []
+if importlib.util.find_spec("pyiceberg") is not None:
+    from deltacat.catalog.iceberg import impl as IcebergCatalog
+
+    __iceberg__ = [
+        "IcebergCatalog",
+    ]
+
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.36"
+__version__ = "2.0.0b2"
 
 
 __all__ = [
     "__version__",
-    "all_catalogs",
+    "copy",
+    "get",
+    "put",
     "alter_table",
     "create_table",
     "drop_table",
@@ -68,20 +93,34 @@ __all__ = [
     "default_namespace",
     "write_to_table",
     "read_table",
+    "get_catalog",
+    "put_catalog",
+    "is_initialized",
     "init",
     "Catalog",
     "ContentType",
     "ContentEncoding",
     "DistributedDataset",
+    "Dataset",
+    "Datatype",
+    "Field",
+    "IcebergCatalog",
     "LifecycleState",
     "ListResult",
     "LocalDataset",
     "LocalTable",
     "Namespace",
+    "NullOrder",
+    "PartitionKey",
+    "PartitionScheme",
+    "Schema",
     "SchemaConsistencyType",
     "SortKey",
     "SortOrder",
+    "SortScheme",
     "TableDefinition",
     "TableType",
     "TableWriteMode",
 ]
+
+__all__ += __iceberg__
```
deltacat/annotations.py
ADDED
```python
def ExperimentalAPI(obj):
    """Decorator for documenting experimental APIs.

    Experimental APIs are classes and methods that are in development and may
    change at any time in their development process. You should not expect
    these APIs to be stable until their tag is changed to `DeveloperAPI` or
    `PublicAPI`.

    Subclasses that inherit from a ``@ExperimentalAPI`` base class can be
    assumed experimental as well.

    This decorator has no effect on runtime behavior
    """
    return obj


def DeveloperAPI(obj):
    """Decorator for documenting experimental APIs.

    Developer APIs are classes and methods explicitly exposed to developers
    for low level integrations with DeltaCAT (e.g.: compute engines, other catalogs).
    You can generally expect these APIs to be stable sans minor changes (but less stable than public APIs).

    This decorator has no effect on runtime behavior
    """
    return obj


def PublicAPI(obj):
    """Decorator for documenting public APIs.

    Public APIs are classes and methods exposed to end users which are expected to remain stable across releases.

    This decorator has no effect on runtime behavior
    """
    return obj
```
deltacat/api.py
ADDED
```python
from typing import Any


import deltacat as dc
from deltacat.catalog import Catalog


def copy(source, destination):
    src_parts = source.split("/")
    src_parts = [part for part in src_parts if part]
    dst_parts = destination.split("/")
    dst_parts = [part for part in dst_parts if part]
    if not dc.is_initialized():
        raise ValueError("Catalog not initialized.")
    if len(src_parts) != len(dst_parts) and len(src_parts) != len(dst_parts) + 1:
        # TODO(pdames): Better error message.
        raise ValueError(
            f"Cannot copy {source} to {destination}. "
            f"Source and destination must share the same type."
        )
    src_obj = get(source)
    if len(src_parts) == 1:
        # copy the given catalog
        raise NotImplementedError
    elif len(src_parts) == 2:
        # TODO(pdames): Make catalog specification optional if there is only
        #  one catalog (e.g., auto-retrieve src_parts[0]/dst_parts[0])
        # copy the given namespace
        src_namespace_name = src_parts[1]
        dst_catalog_name = dst_parts[0]
        dst_namespace_name = dst_parts[1] if len(dst_parts) >= 2 else src_namespace_name
        new_namespace = dc.create_namespace(
            namespace=dst_namespace_name,
            properties=src_obj.properties,
            catalog=dst_catalog_name,
        )
        return new_namespace
    elif len(src_parts) == 3:
        # copy the given table
        raise NotImplementedError
    elif len(src_parts) == 4:
        # copy the given table version
        raise NotImplementedError
    elif len(src_parts) == 5:
        # copy the given stream
        raise NotImplementedError
    elif len(src_parts) == 6:
        # copy the given partition
        raise NotImplementedError
    elif len(src_parts) == 7:
        # copy the given partition delta
        raise NotImplementedError
    raise ValueError(f"Invalid path: {src_parts}")


def concat(source, destination):
    raise NotImplementedError


def delete(source):
    raise NotImplementedError


def move(source, destination):
    raise NotImplementedError


def list(path):
    raise NotImplementedError


def get(path) -> Any:
    parts = path.split("/")
    parts = [part for part in parts if part]
    if not dc.is_initialized():
        # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
        #  last session.
        raise ValueError("Catalog not initialized.")
    if len(parts) == 1:
        # TODO(pdames): Save all catalogs registered from the last session on
        #  disk so that users don't need to re-initialize them every time.
        # get the given catalog
        catalog_name = parts[0]
        return dc.get_catalog(catalog_name)
    elif len(parts) == 2:
        # get the given namespace
        catalog_name = parts[0]
        namespace_name = parts[1]
        return dc.get_namespace(
            namespace=namespace_name,
            catalog=catalog_name,
        )
    elif len(parts) == 3:
        # get the given table
        raise NotImplementedError
    elif len(parts) == 4:
        # get the given table version
        raise NotImplementedError
    elif len(parts) == 5:
        # get the given stream
        raise NotImplementedError
    elif len(parts) == 6:
        # get the given partition
        raise NotImplementedError
    elif len(parts) == 7:
        # get the given partition delta
        raise NotImplementedError
    raise ValueError(f"Invalid path: {path}")


def put(path, *args, **kwargs) -> Any:
    parts = path.split("/")
    parts = [part for part in parts if part]
    if len(parts) == 1:
        # TODO(pdames): Save all catalogs registered from the last session on
        #  disk so that users don't need to re-initialize them every time.
        # register the given catalog
        catalog_name = parts[0]
        # Initialize default catalog using kwargs
        catalog = Catalog(**kwargs)
        return dc.put_catalog(catalog_name, catalog)
    elif len(parts) == 2:
        # register the given namespace
        catalog_name = parts[0]
        namespace_name = parts[1]
        if not dc.is_initialized():
            # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
            #  last session.
            raise ValueError("Catalog not initialized.")
        new_namespace = dc.create_namespace(
            namespace=namespace_name,
            catalog=catalog_name,
            *args,
            **kwargs,
        )
        return new_namespace
    elif len(parts) == 3:
        # register the given table
        raise NotImplementedError
    elif len(parts) == 4:
        # register the given table version
        raise NotImplementedError
    elif len(parts) == 5:
        # register the given stream
        raise NotImplementedError
    elif len(parts) == 6:
        # register the given partition
        raise NotImplementedError
    elif len(parts) == 7:
        # register the given partition delta
        raise NotImplementedError
    raise ValueError(f"Invalid path: {path}")


def exists(path):
    raise NotImplementedError


def query(path, expression):
    raise NotImplementedError


def tail(path):
    raise NotImplementedError


def head(path):
    raise NotImplementedError
```
deltacat/aws/s3u.py
CHANGED
```diff
@@ -14,7 +14,7 @@ from deltacat.aws.constants import (
     DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
 )
 
-import pyarrow
+import pyarrow.fs
 import ray
 import s3fs
 from boto3.resources.base import ServiceResource
@@ -134,7 +134,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
         self,
         base_path: str,
         *,
-        filesystem: Optional[
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
         dataset_uuid: Optional[str] = None,
         block: Optional[ObjectRef[Block]] = None,
         block_index: Optional[int] = None,
@@ -150,7 +150,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
         self,
         base_path: str,
         *,
-        filesystem: Optional[
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
         dataset_uuid: Optional[str] = None,
         block: Optional[ObjectRef[Block]] = None,
         block_index: Optional[int] = None,
@@ -364,7 +364,7 @@ def upload_table(
     **s3_client_kwargs,
 ) -> ManifestEntryList:
     """
-    Writes the given table to 1 or more S3 files and return
+    Writes the given table to 1 or more S3 files and return
    manifest entries describing the uploaded files.
     """
     if s3_table_writer_kwargs is None:
```

(The two removed `filesystem: Optional[` annotation lines are truncated in the source page, and the `upload_table` docstring change is whitespace-only.)
deltacat/benchmarking/benchmark_engine.py
ADDED
```python
import sys
import time
from contextlib import contextmanager
from typing import Generator, Tuple

from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
from deltacat.storage.rivulet.dataset import Dataset
from deltacat.storage.rivulet.reader.query_expression import QueryExpression


@contextmanager
def timed_step(description: str) -> Generator[BenchmarkStep, None, None]:
    """Convenience for computing elapsed time of a block of code as a metric.

    :param description: description of the step
    :return: a benchmark operation populated with the elapsed time
    """
    metric = BenchmarkStep(description)
    start_time = time.time()
    yield metric
    end_time = time.time()
    metric.add(BenchmarkMetric("elapsed_time", 1000 * (end_time - start_time), "ms"))


class BenchmarkEngine:
    def __init__(self, dataset: Dataset):
        self.dataset = dataset

    def load_and_commit(
        self, schema_name, generator, count
    ) -> Tuple[str, BenchmarkStep]:
        """Load count number of rows from the generator and commit.

        :param generator: row generator
        :param count: the number of rows to load into the dataset
        :return: tuple of the manifest URI and a operation measurement
        """
        desc = f"load {count} from {generator}"
        writer = self.dataset.writer(schema_name)
        with timed_step(desc) as step:
            rows = [generator.generate() for _ in range(count)]
            writer.write(rows)
            result = writer.flush()
        step.add(BenchmarkMetric("loaded", count))
        return result, step

    def scan(self) -> Tuple[set[any], BenchmarkStep]:
        """
        Scans the rows in dataset and prints some basic statistics about the manifest

        :return: Tuple[set[any], BenchmarkStep] - a tuple containing a set of merge keys and a benchmark step with metrics
        """
        keys = set()
        object_count = 0
        size_b = 0
        # Note that we expect single col merge keys so we can return key set
        # this will fail with validation error if dataset has multiple merge keys
        merge_key_name = self.dataset.schemas["all"].get_merge_key()
        with timed_step("full scan") as step:
            for row in self.dataset.scan(QueryExpression()).to_pydict():
                object_count += 1
                size_b += sum([sys.getsizeof(x) for x in row.values()])
                keys.add(row.get(merge_key_name))
            # TODO replace with the actual metrics we want to measure
            step.add(BenchmarkMetric("rows read", object_count))
            step.add(BenchmarkMetric("size", size_b / (1024 * 1024), "MB"))
        return keys, step

    def run_queries(
        self, description, manifest_uri, queries: list[QueryExpression]
    ) -> BenchmarkStep:
        object_count = 0
        size_b = 0
        with timed_step(description) as step:
            for query in queries:
                for row in self.dataset.scan(query).to_pydict():
                    object_count += 1
                    size_b += sum([sys.getsizeof(x) for x in row.values()])
            # TODO replace with the actual metrics we want to measure
            step.add(BenchmarkMetric("rows read", object_count))
            step.add(BenchmarkMetric("size", size_b / (1024 * 1024), "MB"))
        return step
```
deltacat/benchmarking/benchmark_report.py
ADDED
```python
from dataclasses import dataclass
from tabulate import tabulate
from typing import Union, Optional


@dataclass
class BenchmarkMetric:
    name: str
    value: Union[float, int]
    unit: Optional[str] = None


class BenchmarkStep:
    """Captures measurements from a given operation"""

    def __init__(self, description):
        self.description: str = description
        """Description of the operation"""
        self._metrics: dict[str, BenchmarkMetric] = {}
        """Description of the operation"""

    def add(self, metric: BenchmarkMetric):
        self._metrics[metric.name] = metric

    def list_metrics(self):
        """List the metrics (sorted by name)"""
        return sorted(self._metrics.values(), key=lambda x: x.name)


class BenchmarkRun:
    """Class for capturing measurements for a given test suite for comparison."""

    def __init__(self, suite: str, description: str):
        self.suite = suite
        """The test suite associated with this report."""
        self.description = description
        """Description of the report"""
        self.steps: list[BenchmarkStep] = []
        """List of steps and their metrics"""

    def add(self, operation):
        self.steps.append(operation)


class BenchmarkReport:
    def __init__(self, name):
        self.name = name
        self.runs: list[BenchmarkRun] = []

    def add(self, run):
        self.runs.append(run)

    def __str__(self):
        """Pretty-print a table that compares the metrics across each report.

        We want to transpose these such that each report gets their own column and each metric gets its own row
        (ideally grouped by operation).
        """
        if not self.runs:
            print("No runs to compare!")
            return
        suites = set(r.suite for r in self.runs)
        if len(suites) > 1:
            print("Found more than one type of suite")
            return
        suite = self.runs[0].suite

        headers = [
            f"{suite} Operation",
            "Metric",
            "Unit",
            *[r.description for r in self.runs],
        ]
        rows = []
        for step_tranche in zip(*[r.steps for r in self.runs]):
            # TODO zip by metric name instead of assuming all metrics are being measured
            step_name = step_tranche[0].description
            for metric_tuple in zip(*[x.list_metrics() for x in step_tranche]):
                row = [
                    step_name,
                    metric_tuple[0].name,
                    metric_tuple[0].unit,
                    *[p.value for p in metric_tuple],
                ]
                rows.append(row)
        return tabulate(rows, headers=headers, tablefmt="fancy_outline")
```
deltacat/benchmarking/benchmark_suite.py
ADDED
```python
from typing import Protocol

from deltacat.benchmarking.benchmark_report import BenchmarkRun


class BenchmarkSuite(Protocol):
    def run(self) -> BenchmarkRun:
        """Run the benchmark suite and produce a report.

        Each report should be comparable against other reports by the same suite"""
        ...
```
deltacat/benchmarking/conftest.py
CHANGED
```diff
@@ -4,7 +4,9 @@ import pyarrow as pa
 import pyarrow.fs as pafs
 import pyarrow.parquet as papq
 import pytest
+from _pytest.terminal import TerminalReporter
 
+from deltacat.benchmarking.benchmark_report import BenchmarkReport
 from deltacat.utils.pyarrow import s3_file_to_table
 from deltacat.types.media import (
     ContentEncoding,
@@ -12,6 +14,25 @@ from deltacat.types.media import (
 )
 
 
+@pytest.fixture(autouse=True, scope="function")
+def report(request):
+    report = BenchmarkReport(request.node.name)
+
+    def final_callback():
+        terminal_reporter: TerminalReporter = request.config.pluginmanager.get_plugin(
+            "terminalreporter"
+        )
+        capture_manager = request.config.pluginmanager.get_plugin("capturemanager")
+        with capture_manager.global_and_fixture_disabled():
+            terminal_reporter.ensure_newline()
+            terminal_reporter.section(request.node.name, sep="-", blue=True, bold=True)
+            terminal_reporter.write(str(report))
+            terminal_reporter.ensure_newline()
+
+    request.addfinalizer(final_callback)
+    return report
+
+
 def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
     assert path.startswith(
         "s3://"
```
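Because the fixture is `autouse`, every test in the package receives a `BenchmarkReport`, and its tabulated summary is printed in its own terminal section at teardown. A hypothetical test that feeds it:

```python
from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkStep


def test_example(report):  # `report` is the autouse fixture above
    run = BenchmarkRun("demo suite", "single run")
    run.add(BenchmarkStep("setup"))
    report.add(run)
```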
deltacat/benchmarking/data/random_row_generator.py
ADDED
```python
import math
import os
from enum import Enum
from io import BytesIO
from typing import Any, Dict

import faker
from faker_file.providers.png_file import GraphicPngFileProvider
from faker_file.storages.filesystem import FileSystemStorage
from PIL import Image

from deltacat.benchmarking.data.row_generator import RowGenerator


class ImageStyle(Enum):
    RANDOM_BYTES = 1
    """Generate random bytes to simulate an image.

    This is the fastest option (if you want to test correctness).
    """
    PILLOW = 2
    """Generate actual PNG files in-memory directly using Pillow"""
    FAKER_FILE = 3
    """Generate PNG files on-disk using with some random elements"""


class RandomRowGenerator(RowGenerator):
    """Generate rows with 'images' that are just randomly-generated bytes"""

    def __init__(
        self, seed=0, tmp_dir=None, style: ImageStyle = ImageStyle.RANDOM_BYTES
    ):
        self.seed = seed
        self.fake = faker.Faker()
        self.fake.seed_instance(seed)
        self.fake.add_provider(GraphicPngFileProvider)
        self.temp_dir = tmp_dir
        self.style = style

    def __str__(self):
        return f"random source"

    def _generate_image(self, width, height) -> bytes:
        if self.style == ImageStyle.RANDOM_BYTES:
            return self._generate_with_random_bytes(width, height)
        elif self.style == ImageStyle.PILLOW:
            return self._generate_with_pillow(width, height)
        elif self.style == ImageStyle.FAKER_FILE:
            return self._generate_with_faker(width, height)
        else:
            raise ValueError("Unknown ImageStyle")

    @staticmethod
    def _generate_with_random_bytes(width, height) -> bytes:
        """Generate random bytes to simulate an image."""
        target_size = math.floor(
            width * height / 50
        )  # this isn't actually how file size relates to image size
        # Assumption: we don't actually need images. It suffices to generate arbitrary-length bytes of random characters.
        return os.urandom(target_size)

    @staticmethod
    def _generate_with_pillow(width, height) -> bytes:
        """Generate actual PNG files in-memory directly using Pillow"""
        file = BytesIO()
        image = Image.new("RGBA", size=(width, height), color=(155, 0, 0))
        image.save(file, "png")
        file.name = "test.png"
        file.seek(0)
        return file.read()

    def _generate_with_faker(self, width, height) -> bytes:
        """Generate PNG files on-disk using with some random elements"""
        rel_name = self.fake.graphic_png_file(
            storage=FileSystemStorage(
                root_path=self.temp_dir,
                rel_path="tmp",
            ),
            size=(width, height),
        )
        file_name = f"{self.temp_dir}/{rel_name}"
        with open(file_name, "rb") as f:
            return f.read()

    def generate(self) -> Dict[str, Any]:
        return {
            "id": self.fake.random_int(0, 10_000_000),
            "source": self.fake.image_url(),
            "media": (
                self._generate_image(
                    self.fake.random_int(512, 2048), self.fake.random_int(512, 4096)
                )
            ),
        }
```
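A usage sketch; `ImageStyle.RANDOM_BYTES` skips real PNG encoding entirely, so it is the cheapest style when only correctness matters, and seeding keeps the generated ids and URLs reproducible (faker, faker-file, and Pillow must be installed to import the module):

```python
from deltacat.benchmarking.data.random_row_generator import (
    ImageStyle,
    RandomRowGenerator,
)

gen = RandomRowGenerator(seed=42, style=ImageStyle.RANDOM_BYTES)
row = gen.generate()
print(row["id"], row["source"], len(row["media"]))
```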
deltacat/benchmarking/data/row_generator.py
ADDED
```python
from typing import Protocol, Iterator, Dict, Any


class RowGenerator(Protocol):
    def generate(self) -> Dict[str, Any]:
        ...

    def generate_dataset(self, count) -> Iterator[Dict[str, Any]]:
        """Generate a dataset with a given number of records"""
        return map(lambda x: self.generate(), iter(range(count)))
```
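Since `RandomRowGenerator` explicitly subclasses this `Protocol`, it inherits the default `generate_dataset` implementation for free:

```python
from deltacat.benchmarking.data.random_row_generator import RandomRowGenerator

rows = list(RandomRowGenerator(seed=7).generate_dataset(3))
assert len(rows) == 3
```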
|