deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -1,6 +1,12 @@
+import importlib
 import logging

 import deltacat.logs  # noqa: F401
+from deltacat.api import (
+    copy,
+    get,
+    put,
+)
 from deltacat.catalog.delegate import (
     alter_namespace,
     alter_table,
@@ -24,32 +30,51 @@ from deltacat.catalog.delegate import (
 from deltacat.catalog.model.catalog import (  # noqa: F401
     Catalog,
     Catalogs,
-    all_catalogs,
+    is_initialized,
     init,
+    get_catalog,
+    put_catalog,
 )
 from deltacat.catalog.model.table_definition import TableDefinition
 from deltacat.storage import (
     DistributedDataset,
+    Field,
     LifecycleState,
     ListResult,
     LocalDataset,
     LocalTable,
     Namespace,
+    PartitionKey,
+    PartitionScheme,
+    Schema,
     SchemaConsistencyType,
     SortKey,
     SortOrder,
+    SortScheme,
+    NullOrder,
 )
+from deltacat.storage.rivulet import Dataset, Datatype
 from deltacat.types.media import ContentEncoding, ContentType, TableType
 from deltacat.types.tables import TableWriteMode

+__iceberg__ = []
+if importlib.util.find_spec("pyiceberg") is not None:
+    from deltacat.catalog.iceberg import impl as IcebergCatalog
+
+    __iceberg__ = [
+        "IcebergCatalog",
+    ]
+
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

-__version__ = "1.1.35"
+__version__ = "2.0"


 __all__ = [
     "__version__",
-    "all_catalogs",
+    "copy",
+    "get",
+    "put",
     "alter_table",
     "create_table",
     "drop_table",
@@ -68,20 +93,34 @@ __all__ = [
     "default_namespace",
     "write_to_table",
     "read_table",
+    "get_catalog",
+    "put_catalog",
+    "is_initialized",
     "init",
     "Catalog",
     "ContentType",
     "ContentEncoding",
     "DistributedDataset",
+    "Dataset",
+    "Datatype",
+    "Field",
+    "IcebergCatalog",
     "LifecycleState",
     "ListResult",
     "LocalDataset",
     "LocalTable",
     "Namespace",
+    "NullOrder",
+    "PartitionKey",
+    "PartitionScheme",
+    "Schema",
     "SchemaConsistencyType",
     "SortKey",
     "SortOrder",
+    "SortScheme",
     "TableDefinition",
     "TableType",
     "TableWriteMode",
 ]
+
+__all__ += __iceberg__
deltacat/annotations.py ADDED
@@ -0,0 +1,36 @@
+def ExperimentalAPI(obj):
+    """Decorator for documenting experimental APIs.
+
+    Experimental APIs are classes and methods that are in development and may
+    change at any time in their development process. You should not expect
+    these APIs to be stable until their tag is changed to `DeveloperAPI` or
+    `PublicAPI`.
+
+    Subclasses that inherit from a ``@ExperimentalAPI`` base class can be
+    assumed experimental as well.
+
+    This decorator has no effect on runtime behavior
+    """
+    return obj
+
+
+def DeveloperAPI(obj):
+    """Decorator for documenting developer APIs.
+
+    Developer APIs are classes and methods explicitly exposed to developers
+    for low level integrations with DeltaCAT (e.g.: compute engines, other catalogs).
+    You can generally expect these APIs to be stable sans minor changes (but less stable than public APIs).
+
+    This decorator has no effect on runtime behavior
+    """
+    return obj
+
+
+def PublicAPI(obj):
+    """Decorator for documenting public APIs.
+
+    Public APIs are classes and methods exposed to end users which are expected to remain stable across releases.
+
+    This decorator has no effect on runtime behavior
+    """
+    return obj
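As an illustration (not part of the diff): these annotations are no-op decorators used purely to mark API stability in documentation. A minimal sketch of how they would be applied, using hypothetical function names:

    from deltacat.annotations import DeveloperAPI, PublicAPI

    @PublicAPI
    def read_my_table(name: str):
        """Hypothetical stable, user-facing entry point."""
        ...

    @DeveloperAPI
    def register_my_scan_planner(planner):
        """Hypothetical lower-level hook intended for integrators."""
        ...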
deltacat/api.py ADDED
@@ -0,0 +1,168 @@
+from typing import Any
+
+
+import deltacat as dc
+from deltacat.catalog import Catalog
+
+
+def copy(source, destination):
+    src_parts = source.split("/")
+    src_parts = [part for part in src_parts if part]
+    dst_parts = destination.split("/")
+    dst_parts = [part for part in dst_parts if part]
+    if not dc.is_initialized():
+        raise ValueError("Catalog not initialized.")
+    if len(src_parts) != len(dst_parts) and len(src_parts) != len(dst_parts) + 1:
+        # TODO(pdames): Better error message.
+        raise ValueError(
+            f"Cannot copy {source} to {destination}. "
+            f"Source and destination must share the same type."
+        )
+    src_obj = get(source)
+    if len(src_parts) == 1:
+        # copy the given catalog
+        raise NotImplementedError
+    elif len(src_parts) == 2:
+        # TODO(pdames): Make catalog specification optional if there is only
+        # one catalog (e.g., auto-retrieve src_parts[0]/dst_parts[0])
+        # copy the given namespace
+        src_namespace_name = src_parts[1]
+        dst_catalog_name = dst_parts[0]
+        dst_namespace_name = dst_parts[1] if len(dst_parts) >= 2 else src_namespace_name
+        new_namespace = dc.create_namespace(
+            namespace=dst_namespace_name,
+            properties=src_obj.properties,
+            catalog=dst_catalog_name,
+        )
+        return new_namespace
+    elif len(src_parts) == 3:
+        # copy the given table
+        raise NotImplementedError
+    elif len(src_parts) == 4:
+        # copy the given table version
+        raise NotImplementedError
+    elif len(src_parts) == 5:
+        # copy the given stream
+        raise NotImplementedError
+    elif len(src_parts) == 6:
+        # copy the given partition
+        raise NotImplementedError
+    elif len(src_parts) == 7:
+        # copy the given partition delta
+        raise NotImplementedError
+    raise ValueError(f"Invalid path: {src_parts}")
+
+
+def concat(source, destination):
+    raise NotImplementedError
+
+
+def delete(source):
+    raise NotImplementedError
+
+
+def move(source, destination):
+    raise NotImplementedError
+
+
+def list(path):
+    raise NotImplementedError
+
+
+def get(path) -> Any:
+    parts = path.split("/")
+    parts = [part for part in parts if part]
+    if not dc.is_initialized():
+        # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
+        # last session.
+        raise ValueError("Catalog not initialized.")
+    if len(parts) == 1:
+        # TODO(pdames): Save all catalogs registered from the last session on
+        # disk so that users don't need to re-initialize them every time.
+        # get the given catalog
+        catalog_name = parts[0]
+        return dc.get_catalog(catalog_name)
+    elif len(parts) == 2:
+        # get the given namespace
+        catalog_name = parts[0]
+        namespace_name = parts[1]
+        return dc.get_namespace(
+            namespace=namespace_name,
+            catalog=catalog_name,
+        )
+    elif len(parts) == 3:
+        # get the given table
+        raise NotImplementedError
+    elif len(parts) == 4:
+        # get the given table version
+        raise NotImplementedError
+    elif len(parts) == 5:
+        # get the given stream
+        raise NotImplementedError
+    elif len(parts) == 6:
+        # get the given partition
+        raise NotImplementedError
+    elif len(parts) == 7:
+        # get the given partition delta
+        raise NotImplementedError
+    raise ValueError(f"Invalid path: {path}")
+
+
+def put(path, *args, **kwargs) -> Any:
+    parts = path.split("/")
+    parts = [part for part in parts if part]
+    if len(parts) == 1:
+        # TODO(pdames): Save all catalogs registered from the last session on
+        # disk so that users don't need to re-initialize them every time.
+        # register the given catalog
+        catalog_name = parts[0]
+        # Initialize default catalog using kwargs
+        catalog = Catalog(**kwargs)
+        return dc.put_catalog(catalog_name, catalog)
+    elif len(parts) == 2:
+        # register the given namespace
+        catalog_name = parts[0]
+        namespace_name = parts[1]
+        if not dc.is_initialized():
+            # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
+            # last session.
+            raise ValueError("Catalog not initialized.")
+        new_namespace = dc.create_namespace(
+            namespace=namespace_name,
+            catalog=catalog_name,
+            *args,
+            **kwargs,
+        )
+        return new_namespace
+    elif len(parts) == 3:
+        # register the given table
+        raise NotImplementedError
+    elif len(parts) == 4:
+        # register the given table version
+        raise NotImplementedError
+    elif len(parts) == 5:
+        # register the given stream
+        raise NotImplementedError
+    elif len(parts) == 6:
+        # register the given partition
+        raise NotImplementedError
+    elif len(parts) == 7:
+        # register the given partition delta
+        raise NotImplementedError
+    raise ValueError(f"Invalid path: {path}")
+
+
+def exists(path):
+    raise NotImplementedError
+
+
+def query(path, expression):
+    raise NotImplementedError
+
+
+def tail(path):
+    raise NotImplementedError
+
+
+def head(path):
+    raise NotImplementedError
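As an illustration (not part of the diff): the new module exposes a path-style API where "catalog/namespace/table/..." segments address catalog objects. A minimal usage sketch, assuming DeltaCAT has already been initialized (e.g. via dc.init()) and using hypothetical catalog and namespace names; any keyword arguments to put() for a catalog path are forwarded to the Catalog constructor and depend on the catalog implementation chosen:

    import deltacat as dc

    # Register two catalogs by single-segment path.
    dc.put("source_catalog")
    dc.put("dest_catalog")

    # Register a namespace in the source catalog, then read it back.
    dc.put("source_catalog/demo_namespace")
    namespace = dc.get("source_catalog/demo_namespace")

    # Copy the namespace (and its properties) into the destination catalog.
    dc.copy("source_catalog/demo_namespace", "dest_catalog/demo_namespace")

Paths deeper than a namespace (table, table version, stream, partition, delta) are parsed but still raise NotImplementedError in this version.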
deltacat/aws/s3u.py CHANGED
@@ -14,7 +14,7 @@ from deltacat.aws.constants import (
     DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
 )

-import pyarrow as pa
+import pyarrow.fs
 import ray
 import s3fs
 from boto3.resources.base import ServiceResource
@@ -134,7 +134,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
         self,
         base_path: str,
         *,
-        filesystem: Optional[pa.filesystem.FileSystem] = None,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
         dataset_uuid: Optional[str] = None,
         block: Optional[ObjectRef[Block]] = None,
         block_index: Optional[int] = None,
@@ -150,7 +150,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
         self,
         base_path: str,
         *,
-        filesystem: Optional[pa.filesystem.FileSystem] = None,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
         dataset_uuid: Optional[str] = None,
         block: Optional[ObjectRef[Block]] = None,
         block_index: Optional[int] = None,
@@ -364,7 +364,7 @@ def upload_table(
     **s3_client_kwargs,
 ) -> ManifestEntryList:
     """
-    Writes the given table to 1 or more S3 files and return Redshift
+    Writes the given table to 1 or more S3 files and return
     manifest entries describing the uploaded files.
     """
     if s3_table_writer_kwargs is None:
deltacat/benchmarking/benchmark_engine.py ADDED
@@ -0,0 +1,82 @@
+import sys
+import time
+from contextlib import contextmanager
+from typing import Generator, Tuple
+
+from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
+from deltacat.storage.rivulet.dataset import Dataset
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+
+
+@contextmanager
+def timed_step(description: str) -> Generator[BenchmarkStep, None, None]:
+    """Convenience for computing elapsed time of a block of code as a metric.
+
+    :param description: description of the step
+    :return: a benchmark operation populated with the elapsed time
+    """
+    metric = BenchmarkStep(description)
+    start_time = time.time()
+    yield metric
+    end_time = time.time()
+    metric.add(BenchmarkMetric("elapsed_time", 1000 * (end_time - start_time), "ms"))
+
+
+class BenchmarkEngine:
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
+
+    def load_and_commit(
+        self, schema_name, generator, count
+    ) -> Tuple[str, BenchmarkStep]:
+        """Load count number of rows from the generator and commit.
+
+        :param generator: row generator
+        :param count: the number of rows to load into the dataset
+        :return: tuple of the manifest URI and an operation measurement
+        """
+        desc = f"load {count} from {generator}"
+        writer = self.dataset.writer(schema_name)
+        with timed_step(desc) as step:
+            rows = [generator.generate() for _ in range(count)]
+            writer.write(rows)
+            result = writer.flush()
+        step.add(BenchmarkMetric("loaded", count))
+        return result, step
+
+    def scan(self) -> Tuple[set[any], BenchmarkStep]:
+        """
+        Scans the rows in dataset and prints some basic statistics about the manifest
+
+        :return: Tuple[set[any], BenchmarkStep] - a tuple containing a set of merge keys and a benchmark step with metrics
+        """
+        keys = set()
+        object_count = 0
+        size_b = 0
+        # Note that we expect single col merge keys so we can return key set
+        # this will fail with validation error if dataset has multiple merge keys
+        merge_key_name = self.dataset.schemas["all"].get_merge_key()
+        with timed_step("full scan") as step:
+            for row in self.dataset.scan(QueryExpression()).to_pydict():
+                object_count += 1
+                size_b += sum([sys.getsizeof(x) for x in row.values()])
+                keys.add(row.get(merge_key_name))
+        # TODO replace with the actual metrics we want to measure
+        step.add(BenchmarkMetric("rows read", object_count))
+        step.add(BenchmarkMetric("size", size_b / (1024 * 1024), "MB"))
+        return keys, step
+
+    def run_queries(
+        self, description, manifest_uri, queries: list[QueryExpression]
+    ) -> BenchmarkStep:
+        object_count = 0
+        size_b = 0
+        with timed_step(description) as step:
+            for query in queries:
+                for row in self.dataset.scan(query).to_pydict():
+                    object_count += 1
+                    size_b += sum([sys.getsizeof(x) for x in row.values()])
+        # TODO replace with the actual metrics we want to measure
+        step.add(BenchmarkMetric("rows read", object_count))
+        step.add(BenchmarkMetric("size", size_b / (1024 * 1024), "MB"))
+        return step
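As an illustration (not part of the diff): timed_step wraps any block of code, records its elapsed time in milliseconds, and lets the caller attach additional metrics to the same step. A minimal sketch with an illustrative step name and metric:

    import time

    from deltacat.benchmarking.benchmark_engine import timed_step
    from deltacat.benchmarking.benchmark_report import BenchmarkMetric

    # Time an arbitrary block of work; elapsed_time is added automatically
    # when the context manager exits, and extra metrics can be attached.
    with timed_step("sleep for a bit") as step:
        time.sleep(0.25)
        step.add(BenchmarkMetric("rows processed", 0))

    for metric in step.list_metrics():
        print(metric.name, metric.value, metric.unit)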
deltacat/benchmarking/benchmark_report.py ADDED
@@ -0,0 +1,86 @@
+from dataclasses import dataclass
+from tabulate import tabulate
+from typing import Union, Optional
+
+
+@dataclass
+class BenchmarkMetric:
+    name: str
+    value: Union[float, int]
+    unit: Optional[str] = None
+
+
+class BenchmarkStep:
+    """Captures measurements from a given operation"""
+
+    def __init__(self, description):
+        self.description: str = description
+        """Description of the operation"""
+        self._metrics: dict[str, BenchmarkMetric] = {}
+        """Metrics recorded for the operation, keyed by name"""
+
+    def add(self, metric: BenchmarkMetric):
+        self._metrics[metric.name] = metric
+
+    def list_metrics(self):
+        """List the metrics (sorted by name)"""
+        return sorted(self._metrics.values(), key=lambda x: x.name)
+
+
+class BenchmarkRun:
+    """Class for capturing measurements for a given test suite for comparison."""
+
+    def __init__(self, suite: str, description: str):
+        self.suite = suite
+        """The test suite associated with this report."""
+        self.description = description
+        """Description of the report"""
+        self.steps: list[BenchmarkStep] = []
+        """List of steps and their metrics"""
+
+    def add(self, operation):
+        self.steps.append(operation)
+
+
+class BenchmarkReport:
+    def __init__(self, name):
+        self.name = name
+        self.runs: list[BenchmarkRun] = []
+
+    def add(self, run):
+        self.runs.append(run)
+
+    def __str__(self):
+        """Pretty-print a table that compares the metrics across each report.
+
+        We want to transpose these such that each report gets their own column and each metric gets its own row
+        (ideally grouped by operation).
+        """
+        if not self.runs:
+            print("No runs to compare!")
+            return
+        suites = set(r.suite for r in self.runs)
+        if len(suites) > 1:
+            print("Found more than one type of suite")
+            return
+        suite = self.runs[0].suite
+
+        headers = [
+            f"{suite} Operation",
+            "Metric",
+            "Unit",
+            *[r.description for r in self.runs],
+        ]
+        rows = []
+        for step_tranche in zip(*[r.steps for r in self.runs]):
+            # TODO zip by metric name instead of assuming all metrics are being measured
+            step_name = step_tranche[0].description
+            for metric_tuple in zip(*[x.list_metrics() for x in step_tranche]):
+                row = [
+                    step_name,
+                    metric_tuple[0].name,
+                    metric_tuple[0].unit,
+                    *[p.value for p in metric_tuple],
+                ]
+                rows.append(row)
+        return tabulate(rows, headers=headers, tablefmt="fancy_outline")
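As an illustration (not part of the diff): a BenchmarkReport collects multiple BenchmarkRun objects from the same suite and renders one comparison column per run. A minimal sketch with made-up run labels and metric values:

    from deltacat.benchmarking.benchmark_report import (
        BenchmarkMetric,
        BenchmarkReport,
        BenchmarkRun,
        BenchmarkStep,
    )

    # Two runs of the same suite, each with one step and one metric, so the
    # report renders a side-by-side column per run.
    report = BenchmarkReport("demo")
    for label, value in [("baseline", 120), ("candidate", 95)]:
        run = BenchmarkRun("demo-suite", label)
        step = BenchmarkStep("full scan")
        step.add(BenchmarkMetric("elapsed_time", value, "ms"))
        run.add(step)
        report.add(run)

    print(report)  # tabulated comparison across runs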
deltacat/benchmarking/benchmark_suite.py ADDED
@@ -0,0 +1,11 @@
+from typing import Protocol
+
+from deltacat.benchmarking.benchmark_report import BenchmarkRun
+
+
+class BenchmarkSuite(Protocol):
+    def run(self) -> BenchmarkRun:
+        """Run the benchmark suite and produce a report.
+
+        Each report should be comparable against other reports by the same suite"""
+        ...
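As an illustration (not part of the diff): because BenchmarkSuite is a Protocol, any class with a matching run() method conforms structurally. A minimal sketch with a trivial, hypothetical suite:

    from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkStep

    class TrivialSuite:
        """Duck-typed implementation of the BenchmarkSuite protocol."""

        def run(self) -> BenchmarkRun:
            run = BenchmarkRun("trivial-suite", "no-op run")
            run.add(BenchmarkStep("do nothing"))
            return run

    result = TrivialSuite().run()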
deltacat/benchmarking/conftest.py CHANGED
@@ -4,7 +4,9 @@ import pyarrow as pa
 import pyarrow.fs as pafs
 import pyarrow.parquet as papq
 import pytest
+from _pytest.terminal import TerminalReporter

+from deltacat.benchmarking.benchmark_report import BenchmarkReport
 from deltacat.utils.pyarrow import s3_file_to_table
 from deltacat.types.media import (
     ContentEncoding,
@@ -12,6 +14,25 @@ from deltacat.types.media import (
 )


+@pytest.fixture(autouse=True, scope="function")
+def report(request):
+    report = BenchmarkReport(request.node.name)
+
+    def final_callback():
+        terminal_reporter: TerminalReporter = request.config.pluginmanager.get_plugin(
+            "terminalreporter"
+        )
+        capture_manager = request.config.pluginmanager.get_plugin("capturemanager")
+        with capture_manager.global_and_fixture_disabled():
+            terminal_reporter.ensure_newline()
+            terminal_reporter.section(request.node.name, sep="-", blue=True, bold=True)
+            terminal_reporter.write(str(report))
+            terminal_reporter.ensure_newline()
+
+    request.addfinalizer(final_callback)
+    return report
+
+
 def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
     assert path.startswith(
         "s3://"
deltacat/benchmarking/data/random_row_generator.py ADDED
@@ -0,0 +1,94 @@
+import math
+import os
+from enum import Enum
+from io import BytesIO
+from typing import Any, Dict
+
+import faker
+from faker_file.providers.png_file import GraphicPngFileProvider
+from faker_file.storages.filesystem import FileSystemStorage
+from PIL import Image
+
+from deltacat.benchmarking.data.row_generator import RowGenerator
+
+
+class ImageStyle(Enum):
+    RANDOM_BYTES = 1
+    """Generate random bytes to simulate an image.
+
+    This is the fastest option (if you want to test correctness).
+    """
+    PILLOW = 2
+    """Generate actual PNG files in-memory directly using Pillow"""
+    FAKER_FILE = 3
+    """Generate PNG files on-disk with some random elements"""
+
+
+class RandomRowGenerator(RowGenerator):
+    """Generate rows with 'images' that are just randomly-generated bytes"""
+
+    def __init__(
+        self, seed=0, tmp_dir=None, style: ImageStyle = ImageStyle.RANDOM_BYTES
+    ):
+        self.seed = seed
+        self.fake = faker.Faker()
+        self.fake.seed_instance(seed)
+        self.fake.add_provider(GraphicPngFileProvider)
+        self.temp_dir = tmp_dir
+        self.style = style
+
+    def __str__(self):
+        return f"random source"
+
+    def _generate_image(self, width, height) -> bytes:
+        if self.style == ImageStyle.RANDOM_BYTES:
+            return self._generate_with_random_bytes(width, height)
+        elif self.style == ImageStyle.PILLOW:
+            return self._generate_with_pillow(width, height)
+        elif self.style == ImageStyle.FAKER_FILE:
+            return self._generate_with_faker(width, height)
+        else:
+            raise ValueError("Unknown ImageStyle")
+
+    @staticmethod
+    def _generate_with_random_bytes(width, height) -> bytes:
+        """Generate random bytes to simulate an image."""
+        target_size = math.floor(
+            width * height / 50
+        )  # this isn't actually how file size relates to image size
+        # Assumption: we don't actually need images. It suffices to generate arbitrary-length bytes of random characters.
+        return os.urandom(target_size)
+
+    @staticmethod
+    def _generate_with_pillow(width, height) -> bytes:
+        """Generate actual PNG files in-memory directly using Pillow"""
+        file = BytesIO()
+        image = Image.new("RGBA", size=(width, height), color=(155, 0, 0))
+        image.save(file, "png")
+        file.name = "test.png"
+        file.seek(0)
+        return file.read()
+
+    def _generate_with_faker(self, width, height) -> bytes:
+        """Generate PNG files on-disk with some random elements"""
+        rel_name = self.fake.graphic_png_file(
+            storage=FileSystemStorage(
+                root_path=self.temp_dir,
+                rel_path="tmp",
+            ),
+            size=(width, height),
+        )
+        file_name = f"{self.temp_dir}/{rel_name}"
+        with open(file_name, "rb") as f:
+            return f.read()
+
+    def generate(self) -> Dict[str, Any]:
+        return {
+            "id": self.fake.random_int(0, 10_000_000),
+            "source": self.fake.image_url(),
+            "media": (
+                self._generate_image(
+                    self.fake.random_int(512, 2048), self.fake.random_int(512, 4096)
+                )
+            ),
+        }
deltacat/benchmarking/data/row_generator.py ADDED
@@ -0,0 +1,10 @@
+from typing import Protocol, Iterator, Dict, Any
+
+
+class RowGenerator(Protocol):
+    def generate(self) -> Dict[str, Any]:
+        ...
+
+    def generate_dataset(self, count) -> Iterator[Dict[str, Any]]:
+        """Generate a dataset with a given number of records"""
+        return map(lambda x: self.generate(), iter(range(count)))
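As an illustration (not part of the diff): RandomRowGenerator subclasses the RowGenerator protocol above, so it inherits the default generate_dataset() implementation. A minimal usage sketch with an arbitrary seed and row count; faker, faker-file, and Pillow must be installed for the module to import:

    from deltacat.benchmarking.data.random_row_generator import (
        ImageStyle,
        RandomRowGenerator,
    )

    # RANDOM_BYTES is the fastest style; it fakes image payloads with os.urandom.
    generator = RandomRowGenerator(seed=42, style=ImageStyle.RANDOM_BYTES)

    # Produce a handful of rows; each row has "id", "source", and "media" fields.
    for row in generator.generate_dataset(5):
        print(row["id"], row["source"], len(row["media"]))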